In [14]:
#basic packages
import numpy as np 
import pandas as pd 
import os
import gc

#natural language processing in Russian
#import nltk
#nltk.download()

#term frequency
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import FeatureUnion
from scipy.sparse import hstack, csr_matrix
from nltk.corpus import stopwords 

#other
from sklearn import preprocessing

In [4]:
#import raw data
# Both files are read the same way: item_id becomes the index and
# activation_date is parsed into datetimes while loading.
training, testing = (
    pd.read_csv(csv_path, index_col="item_id", parse_dates=["activation_date"])
    for csv_path in ('train.csv', 'test.csv')
)

# Keep the original row identifiers of each frame for later use.
traindex, testdex = training.index, testing.index

In [9]:
#feature engineering: price, image_top_1 and calendar features (training and testing)
def _add_price_and_date_features(frame):
    """Add log-price, missing-value sentinels and calendar columns to ``frame``.

    The frame is modified in place. It must contain the columns ``price``,
    ``image_top_1`` and a datetime ``activation_date``.

    Columns written:
      * ``price``         - log(price + 0.001), with missing values set to -999
      * ``image_top_1``   - missing values set to -999
      * ``Weekday``       - ``activation_date.dt.weekday``
      * ``Weekd of Year`` - ISO week number of ``activation_date``
      * ``Day of Month``  - ``activation_date.dt.day``
    """
    # A missing price stays NaN through the log and is then replaced by the sentinel.
    # The result is assigned back to the column: ``frame[col].fillna(..., inplace=True)``
    # is chained assignment and does not update the frame under pandas Copy-on-Write.
    frame["price"] = np.log(frame["price"] + 0.001).fillna(-999)
    frame["image_top_1"] = frame["image_top_1"].fillna(-999)

    activation = frame["activation_date"].dt
    frame["Weekday"] = activation.weekday

    # ``Series.dt.week`` was removed in pandas 2.0; ``isocalendar().week`` is its
    # replacement. It returns a nullable UInt32 column, so cast back to the dtype
    # ``.dt.week`` produced: int64, or float64 when some dates are missing.
    iso_week = activation.isocalendar().week
    frame["Weekd of Year"] = iso_week.astype("float64" if iso_week.isna().any() else "int64")

    frame["Day of Month"] = activation.day


_add_price_and_date_features(training)
_add_price_and_date_features(testing)

  


In [15]:
#create validation index, remove variables we won't use, and encode remaining variables
# Date-based split of each frame: rows activated on or before 2017-04-07 versus
# rows activated on or after 2017-04-08. This must run before activation_date is dropped.
training_index = training.loc[training.activation_date<=pd.to_datetime('2017-04-07')].index
train_val_index = training.loc[training.activation_date>=pd.to_datetime('2017-04-08')].index
testing_index = testing.loc[testing.activation_date<=pd.to_datetime('2017-04-07')].index
test_val_index = testing.loc[testing.activation_date>=pd.to_datetime('2017-04-08')].index

# activation_date and image are not used as features from this point on.
training.drop(["activation_date","image"],axis=1,inplace=True)
testing.drop(["activation_date","image"],axis=1,inplace=True)

categorical = ["user_id","region","city","parent_category_name","category_name","user_type","image_top_1"]

# Fit ONE encoder per column on the values of both frames. Fitting training and
# testing separately (as before) gives the same category different integer codes
# in the two frames, so the encoded columns would not be comparable.
lbl = preprocessing.LabelEncoder()
for col in categorical:
    train_values = training[col].astype(str)
    test_values = testing[col].astype(str)
    lbl.fit(pd.concat([train_values, test_values]))
    training[col] = lbl.transform(train_values)
    testing[col] = lbl.transform(test_values)

In [16]:
#feature engineering: text (training and testing)
def _join_param_text(frame):
    """Return one string per row: ``str(param_1) + ' ' + str(param_2) + ' ' + str(param_3)``.

    Each value is passed through ``str()`` exactly as the previous row-wise
    ``DataFrame.apply(axis=1)`` did, but the three columns are walked in
    parallel, which avoids building a Series object for every row.
    """
    return [
        ' '.join((str(first), str(second), str(third)))
        for first, second, third in zip(frame["param_1"], frame["param_2"], frame["param_3"])
    ]


# Group the three param columns into a single text feature, then drop the originals.
training['text_feat'] = _join_param_text(training)
training.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

testing['text_feat'] = _join_param_text(testing)
testing.drop(["param_1","param_2","param_3"],axis=1,inplace=True)

In [17]:
#meta text features (training and testing)
textfeats = ["description","text_feat", "title"]


def _add_text_meta_features(frame, columns):
    """Normalise each text column of ``frame`` and add simple count features.

    The frame is modified in place. For every column name ``c`` in ``columns``:
      * ``c`` itself is replaced by its lower-cased string form, with missing
        values set to the placeholder ``'nicapotato'``
      * ``c_num_chars``        - number of characters
      * ``c_num_words``        - number of whitespace-separated words
      * ``c_num_unique_words`` - number of distinct words
      * ``c_words_vs_unique``  - unique words as a percentage of all words
    """
    for col in columns:
        # Fill BEFORE casting: astype(str) turns NaN into the literal string "nan",
        # so a fillna placed after it never replaces anything.
        text = frame[col].fillna('nicapotato').astype(str)
        # Lowercase all text, so that capitalized words don't get treated differently.
        text = text.str.lower()
        frame[col] = text

        frame[col + '_num_chars'] = text.apply(len)
        frame[col + '_num_words'] = text.apply(lambda comment: len(comment.split()))
        frame[col + '_num_unique_words'] = text.apply(lambda comment: len(set(comment.split())))
        # Percentage of words that are unique; a text with zero words gives 0/0 -> NaN.
        frame[col + '_words_vs_unique'] = frame[col + '_num_unique_words'] / frame[col + '_num_words'] * 100


_add_text_meta_features(training, textfeats)
_add_text_meta_features(testing, textfeats)

In [18]:
#set stopwords
# Kept as a de-duplicated, sorted LIST: scikit-learn documents stop_words as
# 'english', a list, or None, and recent releases reject other types (such as a
# set) during parameter validation.
russian_stop = sorted(set(stopwords.words('russian')))

#term frequency parameters
# Shared keyword arguments for the TfidfVectorizer instances built from this dict.
tfidf_para = {
    "stop_words": russian_stop,
    "analyzer": 'word',
    "token_pattern": r'\w{1,}',   # any run of one or more word characters is a token
    "sublinear_tf": True,
    "dtype": np.float32,
    "norm": 'l2',
    #"min_df":5,
    #"max_df":.9,
    "smooth_idf":False
}

#build an accessor that pulls a single field out of whatever record it is given
def get_col(col_name):
    """Return a one-argument callable that evaluates ``record[col_name]``.

    The returned callable performs plain subscripting, so it works on any
    object that supports ``[]`` with ``col_name`` as the key (dict, list, ...).
    """
    def pick_field(record):
        return record[col_name]

    return pick_field

#create vectorized NLP variables
# One vectorizer per text field, combined by FeatureUnion. Each vectorizer's
# preprocessor is a get_col accessor, so every input document must be
# subscriptable by the field name (record['description'], record['title'], ...).
# Only the description vocabulary is capped (16000 features); text_feat and
# title are left uncapped.
vectorizer = FeatureUnion(transformer_list=[
    (
        'description',
        TfidfVectorizer(
            preprocessor=get_col('description'),
            ngram_range=(1, 2),
            max_features=16000,
            **tfidf_para,
        ),
    ),
    (
        'text_feat',
        CountVectorizer(
            preprocessor=get_col('text_feat'),
            ngram_range=(1, 2),
        ),
    ),
    (
        'title',
        TfidfVectorizer(
            preprocessor=get_col('title'),
            ngram_range=(1, 2),
            **tfidf_para,
        ),
    ),
])

In [27]:
# Write the engineered training frame to CSV with a header row.
# NOTE(review): index=False means the item_id index is not written to the file;
# confirm that downstream consumers do not need it.
training.to_csv('newtrain.csv', header=True, index=False)

In [31]:
# Write the engineered testing frame to CSV with a header row.
# NOTE(review): index=False means the item_id index is not written to the file;
# confirm that downstream consumers do not need it.
testing.to_csv('newtest.csv', header=True, index=False)