In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')
import numpy as np
from scipy.sparse import hstack

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
# Load the cleaned, preprocessed reviews written by the earlier pipeline steps.
df = pd.read_csv('../../data/clean/clean_data.csv')

# A review whose processed text ended up empty is stored in the CSV as an empty
# field, and read_csv parses empty fields back as NaN (a float). NaN documents
# make CountVectorizer/TfidfVectorizer raise and break `x.split()` further down,
# so restore them to empty strings. This is a no-op when the column has no NaN.
df['processed_text'] = df['processed_text'].fillna('')

In [3]:
# Bag-of-words term counts, with the vocabulary capped at 5000 features.
count_vect = CountVectorizer(max_features=5000)
bow_features = count_vect.fit_transform(df['processed_text'])

# Optional: wrap the counts in a DataFrame for inspection.
# The frame is built directly from the sparse matrix (sparse-dtype columns)
# rather than through .toarray(), which would allocate a dense
# n_reviews x 5000 array just to display the first five rows.
bow_df = pd.DataFrame.sparse.from_spmatrix(
    bow_features, columns=count_vect.get_feature_names_out()
)
print(bow_df.head())

   ab  abdomin  abil  abl  about  aboutbr  abov  abr  absolut  absorb  ...  \
0   0        0     0    0      0        0     0    0        0       0  ...   
1   0        0     0    0      0        0     0    0        0       0  ...   
2   0        0     0    0      0        0     0    0        0       0  ...   
3   0        0     0    0      0        0     0    0        0       0  ...   
4   0        0     0    0      0        0     0    0        0       0  ...   

   zinc  zing  zinger  zip  ziploc  ziplock  ziwipeak  zoe  zone  zuke  
0     0     0       0    0       0        0         0    0     0     0  
1     0     0       0    0       0        0         0    0     0     0  
2     0     0       0    0       0        0         0    0     0     0  
3     0     0       0    0       0        0         0    0     0     0  
4     0     0       0    0       0        0         0    0     0     0  

[5 rows x 5000 columns]


In [4]:


# TF-IDF weights, with the vocabulary capped at 5000 features.
tfidf_vect = TfidfVectorizer(max_features=5000)
tfidf_features = tfidf_vect.fit_transform(df['processed_text'])

# Optional: wrap the weights in a DataFrame for inspection.
# The frame is built directly from the sparse matrix (sparse-dtype columns)
# rather than through .toarray(), which would allocate a dense
# n_reviews x 5000 float array just to display the first five rows.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    tfidf_features, columns=tfidf_vect.get_feature_names_out()
)
print(tfidf_df.head())


    ab  abdomin  abil  abl  about  aboutbr  abov  abr  absolut  absorb  ...  \
0  0.0      0.0   0.0  0.0    0.0      0.0   0.0  0.0      0.0     0.0  ...   
1  0.0      0.0   0.0  0.0    0.0      0.0   0.0  0.0      0.0     0.0  ...   
2  0.0      0.0   0.0  0.0    0.0      0.0   0.0  0.0      0.0     0.0  ...   
3  0.0      0.0   0.0  0.0    0.0      0.0   0.0  0.0      0.0     0.0  ...   
4  0.0      0.0   0.0  0.0    0.0      0.0   0.0  0.0      0.0     0.0  ...   

   zinc  zing  zinger  zip  ziploc  ziplock  ziwipeak  zoe  zone  zuke  
0   0.0   0.0     0.0  0.0     0.0      0.0       0.0  0.0   0.0   0.0  
1   0.0   0.0     0.0  0.0     0.0      0.0       0.0  0.0   0.0   0.0  
2   0.0   0.0     0.0  0.0     0.0      0.0       0.0  0.0   0.0   0.0  
3   0.0   0.0     0.0  0.0     0.0      0.0       0.0  0.0   0.0   0.0  
4   0.0   0.0     0.0  0.0     0.0      0.0       0.0  0.0   0.0   0.0  

[5 rows x 5000 columns]


In [5]:
# Word count per review: number of whitespace-separated tokens in the processed text.
df['review_length'] = [len(text.split()) for text in df['processed_text']]


In [6]:
# Lexicon-based sentiment: VADER compound score for each review.
# NOTE(review): the vocabulary printed above looks stemmed (e.g. 'absolut', 'abil').
# VADER looks up whole words in its lexicon, so scoring stemmed text may miss
# entries -- consider scoring the raw review text instead if it is available.

sia = SentimentIntensityAnalyzer()
df['vader_compound'] = df['processed_text'].map(
    lambda text: sia.polarity_scores(text)['compound']
)


In [7]:


# Final feature matrix: the TF-IDF columns followed by two per-review columns,
# in this order: review_length, vader_compound.
# The matrix is assembled in a single hstack; the earlier intermediate stack
# (TF-IDF + review_length only) was overwritten immediately and was wasted work.
extra_features = df[['review_length', 'vader_compound']].to_numpy(dtype=float)

# format='csr' keeps the result row-sliceable (e.g. for train/test splits);
# hstack's default output format is COO, which does not support indexing.
X = hstack([tfidf_features, extra_features], format='csr')


In [8]:
import os

# Make sure the destination folder exists; exist_ok avoids an error on re-runs.
os.makedirs(name='features/processed', exist_ok=True)

# Write the combined feature matrix X (densified via toarray) to CSV without the
# row index. No column names are supplied, so the header is positional integers.
pd.DataFrame(data=X.toarray()).to_csv(
    path_or_buf='features/processed/engineered_features.csv',
    index=False,
)
