# Run `feature_extraction.py` and `train_test_split.ipynb` first!
(in respective order)

# Creating TF-IDF Matrix
And combining with X_train, X_test, and submission

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
X_train = pd.read_csv("data/components/X_train.csv")
y_train = pd.read_csv("data/components/y_train.csv")
X_test = pd.read_csv("data/components/X_test.csv")
y_test = pd.read_csv("data/components/y_test.csv")
submission = pd.read_csv("data/components/submission.csv")

In [10]:
# Make sure there are no NaNs
X_train['Summary'] = X_train['Summary'].fillna("")
X_train['LemmatizedSummary'] = X_train['LemmatizedSummary'].fillna("")
X_test['Summary'] = X_test['Summary'].fillna("")
X_test['LemmatizedSummary'] = X_test['LemmatizedSummary'].fillna("")
submission['Summary'] = submission['Summary'].fillna("")
submission['LemmatizedSummary'] = submission['LemmatizedSummary'].fillna("")

X_test.head(2)

Unnamed: 0,ProductId,UserId,Summary,Text,Score,Month,Year,Helpful,Unhelpful,ExclaimationCount,...,TextCleaned,UniqueWords,LemmatizedSummary,LemmatizedCleanedText,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore,ProductPopularity,UserPopularity
0,B001VPJYZK,A1XKXV4BSW11TF,for my wife,"Thank you for having this item, I live on Guam...",5.0,8,2010,0,0,1,...,thank item live guam find item wife checked am...,0.857143,for my wife,thank item live guam find item wife checked am...,0.0,0.7269,3.448276,5.0,29,4
1,B000MTFDDI,A1374RLDD5VINW,no pink panther,there isn't one pink panther episode on the wh...,1.0,1,2013,0,5,1,...,pink panther episode whole dvd buy pink panthe...,0.76,no pink panther,pink panther episode whole dvd buy pink panthe...,-0.296,0.0,4.909091,4.2,11,10


In [42]:
# Don't include words frequent as 55%+ and as infrequent as 1% of text
tfidf_vectorizer = TfidfVectorizer(max_df = 0.55, min_df = 0.01, ngram_range=(1,3))

In [43]:
combined_text_train = X_train['TextCleaned'] + " " + X_train['Summary']
X_train_tfidf_full = tfidf_vectorizer.fit_transform(combined_text_train)

In [44]:
combined_text_test = X_test['TextCleaned'] + " " + X_test['Summary']
X_test_tfidf_full = tfidf_vectorizer.transform(combined_text_test)

In [45]:
combined_text_submission = submission['TextCleaned'] + " " + submission['Summary']
submission_tfidf_full = tfidf_vectorizer.transform(combined_text_submission)

In [46]:
X_train_tfidf_full.shape

(1188268, 1262)

In [56]:
def combine_tfidf(X, tfidf):
    X_tfidf = pd.DataFrame(tfidf.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    X_tfidf.reset_index(drop=True, inplace=True)
    X.reset_index(drop=True, inplace=True)
    
    X_combined_df = pd.concat([X_tfidf, X], axis=1)

    return X_combined_df

In [57]:
columns_to_drop = ['ProductId', 'UserId', 'Summary', 'Text', 'LemmatizedSummary', 'LemmatizedCleanedText', 'TextCleaned','Score']

In [58]:
X_train_combined = combine_tfidf(X_train, X_train_tfidf_full)
X_train_combined.head(2)

Unnamed: 0,ability,able,absolutely,across,act,acted,acting,action,actor,actors,...,TextCleaned,UniqueWords,LemmatizedSummary,LemmatizedCleanedText,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore,ProductPopularity,UserPopularity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,believe parents problem scary gore nailbiter s...,0.670103,a horror classic,believe parent problem scary gore nailbiter sa...,-0.5719,-0.8781,3.2,3.781513,50,119
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,blind fury good simple could omega doom promis...,0.851064,blind fury is a good movie but omega doom left...,blind fury good simple could omega doom promis...,-0.8151,-0.8834,2.5,2.916667,8,12


In [59]:
X_test_combined = combine_tfidf(X_test, X_test_tfidf_full)
X_test_combined.head(2)

Unnamed: 0,ability,able,absolutely,across,act,acted,acting,action,actor,actors,...,TextCleaned,UniqueWords,LemmatizedSummary,LemmatizedCleanedText,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore,ProductPopularity,UserPopularity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,thank item live guam find item wife checked am...,0.857143,for my wife,thank item live guam find item wife checked am...,0.0,0.7269,3.448276,5.0,29,4
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,pink panther episode whole dvd buy pink panthe...,0.76,no pink panther,pink panther episode whole dvd buy pink panthe...,-0.296,0.0,4.909091,4.2,11,10


In [60]:
submission_combined = combine_tfidf(submission, submission_tfidf_full)
submission_combined.head(2)

Unnamed: 0,ability,able,absolutely,across,act,acted,acting,action,actor,actors,...,TextCleaned,UniqueWords,LemmatizedSummary,LemmatizedCleanedText,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore,ProductPopularity,UserPopularity
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.226498,...,alright people saying ensemble cast misleading...,0.650888,okay for a rental,alright people saying ensemble cast misleading...,0.2263,0.9866,3.478548,2.545455,303,22
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,kids love exciting fun watch well written kids...,0.95,great for kid,kid love exciting fun watch well written kid w...,0.6249,0.9153,4.182927,4.214286,82,14


In [33]:
X_train_combined = X_train_combined.drop(columns=columns_to_drop, axis=1)
X_test_combined = X_test_combined.drop(columns=columns_to_drop, axis=1)
submission_combined = submission_combined.drop(columns=columns_to_drop, axis=1)

In [63]:
submission_combined.shape, X_test_combined.shape, X_train_combined.shape

((212192, 1287), (297067, 1285), (1188268, 1285))

## Save files

In [69]:
X_train_combined.to_csv("data/components/tfidf/X_train_full.csv", index=False, header=True)
X_test_combined.to_csv("data/components/tfidf/X_test_full.csv", index=False, header=True)
submission_combined.to_csv("data/components/tfidf/submission_full.csv", index=False, header=True)

## GloVe Experiments
(this didn't work out, so ignore this section please)

In [48]:
import numpy as np 

def load_glove_embeddings(file_path):
    embeddings = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.array(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

glove_embeddings = load_glove_embeddings('experiments/glove.6B.300d.txt')

In [51]:
def get_sentence_vector(words, glove_embeddings):
    vectors = [glove_embeddings.get(word) for word in words if word in glove_embeddings]
    return np.mean(vectors, axis=0) if vectors else np.zeros(300)

In [57]:
X_train.shape

(1188268, 15)

In [55]:
X_train['SentenceVector'] = X_train['LemmatizedCleanedText'].apply(lambda x: get_sentence_vector(x, glove_embeddings))

In [58]:
X_test['SentenceVector'] = X_test['LemmatizedCleanedText'].apply(lambda x: get_sentence_vector(x, glove_embeddings))

In [65]:
X_train[:500_000][['SummarySentiment', 'CleanedTextSentiment', 'ProductAvgScore', 'UserAvgScore', 'SentenceVector']]

Unnamed: 0,SummarySentiment,CleanedTextSentiment,ProductAvgScore,UserAvgScore,SentenceVector
0,-0.5719,-0.8781,3.200000,3.781513,"[-0.2892621, 0.075449675, -0.28249758, -0.0367..."
1,-0.8151,-0.8834,2.500000,2.916667,"[-0.25809747, 0.07172786, -0.29970953, -0.0479..."
2,0.0000,0.6808,3.750000,1.772727,"[-0.26813552, 0.016843056, -0.3056523, -0.0527..."
3,0.8271,0.9081,3.750000,3.888889,"[-0.31486198, 0.0812924, -0.2991736, -0.060852..."
4,0.2732,-0.1027,3.690909,2.666667,"[-0.26775372, 0.08156801, -0.3879338, -0.07389..."
...,...,...,...,...,...
499995,-0.1779,0.8271,3.416667,4.833333,"[-0.27551064, 0.040809095, -0.32630408, -0.052..."
499996,0.6369,0.9578,4.175115,3.500000,"[-0.1864752, 0.03579887, -0.32041645, -0.11680..."
499997,0.7003,-0.7430,3.414239,3.200000,"[-0.24792093, 0.035204932, -0.27055123, -0.069..."
499998,0.0000,0.9432,4.153846,3.769231,"[-0.27007738, 0.034698863, -0.27886388, -0.063..."
