In [1]:
import pandas as pd
import numpy as np
from data_utils import remove_accented_chars
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [2]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [3]:
train = pd.read_csv("proper_cleaned_data/clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv")
train.head()

Unnamed: 0,review_id,title,year,user_review,user_suggestion,developer,publisher,tags,overview
0,1,Spooky's Jump Scare Mansion,2016.0,i am scared and hearing creepy voices so i wil...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
1,2,Spooky's Jump Scare Mansion,2016.0,best game more better than sam peppers youtube...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
2,3,Spooky's Jump Scare Mansion,2016.0,a littly iffy on the controls but once you kno...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
3,4,Spooky's Jump Scare Mansion,2015.0,great game fun and colorful and all that a sid...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...
4,5,Spooky's Jump Scare Mansion,2015.0,not many games have the cute tag right next to...,1,Lag Studios,Lag Studios,Horror Free to Play Cute First-Person Singlepl...,can you survive 1000 rooms of cute terror or w...


In [4]:
print("Original Train File: ", train.shape)
train = train.drop(train[train['user_review'].isna()].index) #Dropping na index from text
empty_idx = train[train['user_review'] == ' '].index
train = train.drop(index=empty_idx)
print("Train File After Dropping na and ' ' empty string: ",train.shape)
user_review = train.user_review.values
idx = []
for i,ur in enumerate(user_review): #Checking whether there are still values which are not string (nan is a float)
    if not isinstance(ur,str):
        idx.append(i)
train = train.drop(index=idx)
print("Train File After Double Check of na and ' ' empty string: ",train.shape)

Original Train File:  (17494, 9)
Train File After Dropping na and ' ' empty string:  (17472, 9)
Train File After Double Check of na and ' ' empty string:  (17472, 9)


In [5]:
train['title'] = train['title'].apply(remove_accented_chars).apply(str.lower).replace(r"[-!':]"," ",regex=True)
train['tags'] = train['tags'].apply(remove_accented_chars).apply(str.lower).replace(r"[-!':]"," ",regex=True)
train['user_review'] = train['user_review'] + " " + train['title'] + " " + train['tags'] 

In [7]:
def count_tfidf_without_stop_removal(train,create_test=True):
    print("Original Train File: ", train.shape)
    train = train.drop(train[train['user_review'].isna()].index) #Dropping na index from text
    empty_idx = train[train['user_review'] == ' '].index
    train = train.drop(index=empty_idx)
    print("Train File After Dropping na and ' ' empty string: ",train.shape)
    user_review = train.user_review.values
    idx = []
    for i,ur in enumerate(user_review): #Checking whether there are still values which are not string (nan is a float)
        if not isinstance(ur,str):
            idx.append(i)
    train = train.drop(index=idx)
    print("Train File After Double Check of na and ' ' empty string: ",train.shape)
    user_review = train.user_review.values #Safe since we have updated train at each step
    docs = nlp.pipe(user_review) #fitting it to the pipeline
    cleaned_text = []
    pbar = tqdm(total=len(user_review))
    for i,doc in enumerate(docs):
        doc_vec = []
        for token in doc:
            doc_vec.append(token.text)
        cleaned_text.append(doc_vec)
        if i%1000 == 0  and i != 0:
            pbar.update(1000)
    pbar.close()
    print("++++++++++ CLEANING DONE +++++++++++++")
    print("+++++++++ ORIGINAL REVIEW +++++++++++")
    print(user_review[0])
    print("+++++++++ TEXT CLEANED ++++++++++++++")
    print(" ".join(cleaned_text[0]))
    corpus = np.array([" ".join(t) for t in cleaned_text])
    if create_test:
        X_train,X_test,y_train,y_test = train_test_split(corpus,train.user_suggestion.values,stratify=train.user_suggestion.values,test_size=0.2)
        return X_train,X_test,y_train,y_test,corpus
    else:
        return corpus,train.user_suggestion.values,corpus

In [13]:
# X_train,X_test,y_train,y_test,train_corpus = count_tfidf_without_stop_removal(train)
X_train,y_train,corpus = count_tfidf_without_stop_removal(train,create_test=False)

  0%|          | 0/17472 [00:00<?, ?it/s]

Original Train File:  (17472, 9)
Train File After Dropping na and ' ' empty string:  (17472, 9)
Train File After Double Check of na and ' ' empty string:  (17472, 9)


 97%|█████████▋| 17000/17472 [04:23<00:07, 64.46it/s]


++++++++++ CLEANING DONE +++++++++++++
+++++++++ ORIGINAL REVIEW +++++++++++
i am scared and hearing creepy voices so i will pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood but more bubble and clean hello 1990s what charactes there are that isnot trying to kill me were likable and a bit odd i did do a few noob things though such as oh look a class room full of ghosts from dead children lets shine my flashlight on them and stand there staring at them or hmm creepy music i will turn around and see if i can see what is chasing me never before in a game have i been this afraid of finding a locked door spooky s jump scare mansion horror free to play cute first person singleplayer psychological horror indie adventure dark funny atmospheric action walking simulator survival survival horror anime gore comedy multiplayer illumina

In [14]:
test = pd.read_csv("proper_cleaned_data/test_clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv")
test['title'] = test['title'].apply(remove_accented_chars).apply(str.lower).replace(r"[-!':]"," ",regex=True)
test['tags'] = test['tags'].apply(remove_accented_chars).apply(str.lower).replace(r"[-!':]"," ",regex=True)
test['user_review'] = test['user_review'] + " " + test['title'] + " " + test['tags'] 

In [15]:
def test_cleaning_without_stop_removal(test):
    nan_idx = test[test['user_review'].isna()].index
    test.loc[nan_idx,'user_review'] = test.loc[nan_idx,'title']
    test_user_review = test.user_review
    for i,ur in enumerate(test_user_review):
        if not isinstance(ur,str):
            print(i)
    test_data = []
    test_docs = nlp.pipe(test_user_review)
    pbar = tqdm(total=len(test_user_review))
    for i,doc in enumerate(test_docs):
        doc_vec = []
        for token in doc:
            doc_vec.append(token.text)
        test_data.append(doc_vec)
        if i%1000 == 0  and i != 0:
            pbar.update(1000)
    pbar.close()
    test_corpus = np.array([" ".join(t) for t in test_data])
    return test_corpus,test_corpus

In [16]:
X_test_sub,test_corpus = test_cleaning_without_stop_removal(test)

 99%|█████████▉| 8000/8045 [02:30<00:00, 53.32it/s]


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from xgboost import XGBClassifier

In [17]:
cv = CountVectorizer(ngram_range=(1,2))
corpus = np.concatenate((corpus,test_corpus))
full_corpus = cv.fit_transform(corpus)
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test_sub)
lr = LogisticRegression(max_iter=800)
lr.fit(X_train_cv,y_train)
y_pred = lr.predict(X_test_cv)
#Highest submission 85 with logistic regression

In [20]:
sub = pd.DataFrame()
sub['review_id'] = test.review_id
sub['user_suggestion'] = y_pred
sub.to_csv("submission_tfidf_stop.csv",index=False)