# TF-IDF processing

TF-IDF vectorization followed by truncated SVD (a PCA-like dimensionality reduction) on those vectors

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn import preprocessing, model_selection, metrics
import nltk
# Download each NLTK resource by its identifier.
for resource_name in ('punkt', 'averaged_perceptron_tagger_ru', 'tagsets'):
    nltk.download(resource_name)

Load data and stopwords

In [None]:
# Load the train and test tables from the working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Replace missing text with the literal "NA" so every cell in the text columns
# is a string. The result is assigned back to the column rather than calling
# fillna(inplace=True) on the column selection: that chained in-place form
# operates on an intermediate object, triggers a warning in pandas 2.x and
# leaves the frame unchanged under copy-on-write.
for frame in (train_df, test_df):
    for text_col in ("description", "title"):
        frame[text_col] = frame[text_col].fillna("NA")

In [None]:
import codecs

# Stop-word set built from 'stopwords_ru.txt' (decoded as cp1251).
# Each line has its surrounding CR/LF characters removed and is lower-cased;
# no other whitespace is trimmed and blank lines are not filtered out.
stopwords = set()
with codecs.open('stopwords_ru.txt', encoding='cp1251') as stopword_file:
    stopwords.update(line.strip("\r\n").lower() for line in stopword_file)

Define stemmer

In [None]:
from nltk.stem.snowball import SnowballStemmer

# Shared Snowball stemmer configured for Russian.
stemmer = SnowballStemmer("russian")


def stem(s):
    """Lower-case ``s`` and return the result of the Russian Snowball stemmer.

    NOTE(review): the stemmer is called once on the entire string, so it
    presumably expects a single word -- verify that callers do not pass
    multi-word text.
    """
    lowered = s.lower()
    return stemmer.stem(lowered)

In [None]:
# Number of SVD components generated per text column (passed to tfidf_main as n_comp).
n_svd_components = 30

In [None]:
# chunksize = 10**6
# train_active_titles = []
# train_active_desc = []
# for chunk in pd.read_csv(os.path.join(DATA_PATH, 'train_active.csv'), chunksize = chunksize):
#     chunk["title"].fillna("NA", inplace=True)
#     chunk["description"].fillna("NA", inplace=True)
#     train_active_titles += chunk['title'].values.tolist()
#     train_active_desc += chunk['description'].values.tolist()

In [None]:
# chunksize = 10**6
# test_active_titles = []
# test_active_desc = []
# for chunk in pd.read_csv(os.path.join(DATA_PATH, 'test_active.csv'), chunksize = chunksize):
#     chunk["title"].fillna("NA", inplace=True)
#     chunk["description"].fillna("NA", inplace=True)
#     test_active_titles += chunk['title'].values.tolist()
#     test_active_desc += chunk['description'].values.tolist()

In [None]:
def tfidf_main(train_df, test_df, col_name, n_comp):
    """Append truncated-SVD features of a text column's TF-IDF matrix.

    A uni-/bi-gram TF-IDF model is fitted on the train and test texts of
    ``col_name`` together, reduced to ``n_comp`` dimensions with
    ``TruncatedSVD``, and the reduced vectors are appended to both frames as
    columns named ``<col_name>_svd_<k>_ngram`` for ``k = 1 .. n_comp``.

    Args:
        train_df: Training frame containing the string column ``col_name``.
        test_df: Test frame containing the string column ``col_name``.
        col_name: Name of the text column to vectorize.
        n_comp: Number of SVD components to generate.

    Returns:
        Tuple ``(train_df, test_df)``: new frames equal to the inputs with
        the SVD feature columns appended; the inputs are not modified.
    """
    import functools
    import re

    # Same token definition as scikit-learn's default ``token_pattern``.
    token_re = re.compile(r"(?u)\b\w\w+\b")
    # Words repeat heavily across documents, so memoise the per-word stemming.
    stem_word = functools.lru_cache(maxsize=2 ** 18)(stem)

    def tokenize_and_stem(text):
        """Tokenize lower-cased text, drop stop words, stem the remaining tokens."""
        # Stemming happens per token: a vectorizer ``preprocessor`` receives the
        # whole document, so passing ``stem`` there stemmed it as a single word.
        # Stop words are matched on the raw tokens (before stemming) here rather
        # than through ``stop_words=``, which does not accept a ``set`` in
        # recent scikit-learn releases.
        return [stem_word(tok) for tok in token_re.findall(text) if tok not in stopwords]

    train_texts = train_df[col_name].values.tolist()
    test_texts = test_df[col_name].values.tolist()
    n_train = len(train_texts)

    ### TFIDF Vectorizer ###
    # The default ``lowercase=True`` lower-cases documents before tokenizing;
    # ``token_pattern=None`` because the custom tokenizer replaces the pattern.
    tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_and_stem, token_pattern=None)
    full_tfidf = tfidf_vec.fit_transform(train_texts + test_texts)

    ### SVD Components ###
    svd_obj = TruncatedSVD(n_components=n_comp, algorithm='arpack')
    svd_obj.fit(full_tfidf)

    svd_col_names = ['%s_svd_%s_ngram' % (col_name, i + 1) for i in range(n_comp)]

    # The rows of ``full_tfidf`` are the train documents followed by the test
    # documents, so slice the fitted matrix instead of vectorizing each part a
    # second time. Reusing each frame's own index keeps the axis=1 concat
    # row-aligned even when the frame does not carry a default RangeIndex.
    train_svd = pd.DataFrame(
        svd_obj.transform(full_tfidf[:n_train]),
        columns=svd_col_names,
        index=train_df.index,
    )
    test_svd = pd.DataFrame(
        svd_obj.transform(full_tfidf[n_train:]),
        columns=svd_col_names,
        index=test_df.index,
    )

    train_df = pd.concat([train_df, train_svd], axis=1)
    test_df = pd.concat([test_df, test_svd], axis=1)

    return train_df, test_df

In [None]:
# Add the SVD features for each text column in turn and collect the names of
# the columns that tfidf_main appends.
generated_col_names = []
for col_name in ('title', 'description'):
    train_df, test_df = tfidf_main(train_df, test_df, col_name, n_svd_components)
    generated_col_names.extend(
        '%s_svd_%s_ngram' % (col_name, idx + 1) for idx in range(n_svd_components)
    )

In [None]:
# Show the column index of train_df.
train_df.columns

In [None]:
# Keep only the generated SVD feature columns of the train frame.
train_df_save = train_df.loc[:, generated_col_names]

In [None]:
# Keep only the generated SVD feature columns of the test frame.
test_df_save = test_df.loc[:, generated_col_names]

In [None]:
import os  # imported here because no visible cell imports os

# Write the generated train features as CSV under DATA_PATH.
# NOTE(review): DATA_PATH is not defined in any visible cell -- define it
# before running this cell.
train_df_save.to_csv(os.path.join(DATA_PATH, 'train_tfidf_uni_bi_grams_data.csv'))

In [None]:
import os  # imported here because no visible cell imports os

# Write the generated test features as CSV under DATA_PATH.
# NOTE(review): DATA_PATH is not defined in any visible cell -- define it
# before running this cell.
test_df_save.to_csv(os.path.join(DATA_PATH, 'test_tfidf_uni_bi_grams_data.csv'))