In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer
from tqdm import tqdm
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [3]:
import spacy
# Load spaCy's small English pipeline once; the tokenizing helpers defined
# below read this module-level `nlp` object.
nlp = spacy.load("en_core_web_sm")

In [4]:
def count_tfidf_without_stop_removal(train,create_test=True,random_state=None):
    """Tokenize the training reviews with spaCy and optionally split them.

    Rows whose ``user_review`` is missing, equal to ``' '`` or not a string
    are discarded.  Each remaining review is passed through the module-level
    ``nlp`` pipeline and rebuilt as its token texts joined by single spaces;
    no stop-word filtering is applied.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide the columns ``user_review`` and ``user_suggestion``.
        The caller's frame is not modified; filtering produces new frames.
    create_test : bool, default True
        When true, hold out a stratified 20% of the rows.
    random_state : int or None, default None
        Forwarded to ``train_test_split``; pass an int for a reproducible
        split.  ``None`` keeps the previous (unseeded) behaviour.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, corpus)`` when ``create_test``
        is true, otherwise ``(corpus, labels, corpus)``.  ``corpus`` is a
        numpy array holding every cleaned review.
    """
    print("Original Train File: ", train.shape)
    reviews = train['user_review']
    # Filter with a boolean mask instead of dropping by index label.
    train = train[~(reviews.isna() | (reviews == ' '))]
    print("Train File After Dropping na and ' ' empty string: ",train.shape)

    # Double check for values that are still not strings (nan is a float).
    # The mask is positional, so it stays correct even though the index has
    # gaps after the filtering above.  Passing enumerate() positions to
    # drop(index=...) would treat them as labels and could remove the wrong
    # rows or raise KeyError.
    is_text = np.array([isinstance(ur,str) for ur in train.user_review.values],dtype=bool)
    train = train[is_text]
    print("Train File After Double Check of na and ' ' empty string: ",train.shape)

    user_review = train.user_review.values
    labels = train.user_suggestion.values

    # Let tqdm wrap the document stream so the bar advances per review and
    # finishes at 100% instead of being bumped manually every 1000 items.
    cleaned_text = [
        [token.text for token in doc]
        for doc in tqdm(nlp.pipe(user_review),total=len(user_review))
    ]

    print("++++++++++ CLEANING DONE +++++++++++++")
    if len(cleaned_text):
        # Show one before/after pair as a sanity check; skipped when no rows
        # survive the filtering, which previously raised IndexError.
        print("+++++++++ ORIGINAL REVIEW +++++++++++")
        print(user_review[0])
        print("+++++++++ TEXT CLEANED ++++++++++++++")
        print(" ".join(cleaned_text[0]))

    corpus = np.array([" ".join(t) for t in cleaned_text])
    if create_test:
        X_train,X_test,y_train,y_test = train_test_split(
            corpus,labels,stratify=labels,test_size=0.2,random_state=random_state
        )
        return X_train,X_test,y_train,y_test,corpus
    return corpus,labels,corpus

In [5]:
def test_cleaning_without_stop_removal(test):
    """Tokenize the test reviews with spaCy, one output row per input row.

    Missing ``user_review`` values are replaced by the row's ``title``; this
    fill is written back into ``test`` (as before).  Any value that is still
    not a string afterwards is tokenized as an empty string, so no row is
    dropped and the output stays aligned with the rows of ``test``.

    Parameters
    ----------
    test : pandas.DataFrame
        Must provide the columns ``user_review`` and ``title``.

    Returns
    -------
    tuple
        ``(test_corpus, test_corpus)`` - the same numpy array of space-joined
        token texts, returned twice.
    """
    nan_idx = test[test['user_review'].isna()].index
    test.loc[nan_idx,'user_review'] = test.loc[nan_idx,'title']

    raw_reviews = list(test.user_review)
    # Previously such rows were only printed and then handed to spaCy, which
    # cannot process non-string input.  Report them once and substitute ''.
    bad_positions = [i for i,ur in enumerate(raw_reviews) if not isinstance(ur,str)]
    if bad_positions:
        print("Non-string reviews replaced by '' at positions: ",bad_positions)
    test_user_review = [ur if isinstance(ur,str) else "" for ur in raw_reviews]

    # tqdm wraps the document stream so the bar advances per review and
    # actually reaches the total.
    test_data = [
        [token.text for token in doc]
        for doc in tqdm(nlp.pipe(test_user_review),total=len(test_user_review))
    ]
    test_corpus = np.array([" ".join(t) for t in test_data])
    return test_corpus,test_corpus

In [13]:
# Read the cleaned training reviews, then tokenize them and carve out the
# hold-out split in one call.
train = pd.read_csv(
    "proper_cleaned_data/clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv"
)
(X_train, X_test, y_train, y_test, train_corpus) = count_tfidf_without_stop_removal(
    train, create_test=True
)
# train = pd.read_csv("proper_cleaned_data/clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv")
# X_train,y_train,train_corpus = count_tfidf_without_stop_removal(train,create_test=False)


  0%|          | 0/17472 [00:00<?, ?it/s][A

Original Train File:  (17494, 9)
Train File After Dropping na and ' ' empty string:  (17472, 9)
Train File After Double Check of na and ' ' empty string:  (17472, 9)



  6%|▌         | 1000/17472 [00:22<06:09, 44.63it/s][A
  6%|▌         | 1000/17472 [00:34<06:09, 44.63it/s][A
 11%|█▏        | 2000/17472 [00:40<05:27, 47.27it/s][A
 11%|█▏        | 2000/17472 [00:54<05:27, 47.27it/s][A
 17%|█▋        | 3000/17472 [00:57<04:48, 50.09it/s][A
 17%|█▋        | 3000/17472 [01:14<04:48, 50.09it/s][A
 23%|██▎       | 4000/17472 [01:14<04:15, 52.64it/s][A
 29%|██▊       | 5000/17472 [01:28<03:36, 57.67it/s][A
 29%|██▊       | 5000/17472 [01:44<03:36, 57.67it/s][A
 34%|███▍      | 6000/17472 [01:46<03:21, 56.90it/s][A
 40%|████      | 7000/17472 [02:02<02:59, 58.36it/s][A
 40%|████      | 7000/17472 [02:14<02:59, 58.36it/s][A
 46%|████▌     | 8000/17472 [02:16<02:34, 61.30it/s][A
 52%|█████▏    | 9000/17472 [02:32<02:15, 62.34it/s][A
 52%|█████▏    | 9000/17472 [02:44<02:15, 62.34it/s][A
 57%|█████▋    | 10000/17472 [02:49<02:02, 61.22it/s][A
 57%|█████▋    | 10000/17472 [03:04<02:02, 61.22it/s][A
 63%|██████▎   | 11000/17472 [03:05<01:46, 60

++++++++++ CLEANING DONE +++++++++++++
+++++++++ ORIGINAL REVIEW +++++++++++
i am scared and hearing creepy voices so i will pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times this game is adorable and creepy like my happy tree friends but with the graphics sceme of my childhood but more bubble and clean hello 1990s what charactes there are that isnot trying to kill me were likable and a bit odd i did do a few noob things though such as oh look a class room full of ghosts from dead children lets shine my flashlight on them and stand there staring at them or hmm creepy music i will turn around and see if i can see what is chasing me never before in a game have i been this afraid of finding a locked door
+++++++++ TEXT CLEANED ++++++++++++++
i am scared and hearing creepy voices so i will pause for a moment and write a review while i wait for my heart beat to return to atleast somewhat calmer times this game is adorable and cre

In [None]:
# Read the cleaned test reviews and tokenize them; the helper returns the
# same array twice, bound here to two names.
test = pd.read_csv(
    "proper_cleaned_data/test_clean_review_tags_overview_notremoved_stopwords_frequent_nonfrequent_words.csv"
)
(X_test_sub, test_corpus) = test_cleaning_without_stop_removal(test=test)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,TfidfTransformer

In [10]:
def local_test_count(model,corpus,X_train,y_train,X_test,y_test="Not passed",review_ids=None):
    """Fit ``model`` on uni+bigram counts and score it or build a submission.

    The ``CountVectorizer`` vocabulary is learned from ``corpus``; ``X_train``
    and ``X_test`` are then transformed with it.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
    corpus : iterable of str
        Texts used only to build the vocabulary.
    X_train, X_test : iterable of str
        Raw texts to fit on / predict for.
    y_train : array-like
        Labels for ``X_train``.
    y_test : array-like, optional
        Labels for ``X_test``.  When omitted (the ``"Not passed"`` sentinel)
        or ``None``, a submission frame is returned instead of a score.
    review_ids : array-like, optional
        Ids for the submission frame.  Defaults to the notebook-global
        ``test.review_id`` to stay compatible with existing calls.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` after printing the F1 score when ``y_test`` is given;
        otherwise a frame with ``review_id`` and ``user_suggestion``.
    """
    cv = CountVectorizer(ngram_range=(1,2))
    cv.fit(corpus)  # only the vocabulary is needed, not the transformed corpus
    X_train_cv = cv.transform(X_train)
    X_test_cv = cv.transform(X_test)
    model.fit(X_train_cv,y_train)
    y_pred = model.predict(X_test_cv)

    # Test the sentinel by type first: comparing a numpy array with a string
    # is an elementwise comparison, which either warns or has no single
    # truth value depending on the numpy version.
    labels_missing = y_test is None or (isinstance(y_test,str) and y_test == "Not passed")
    if not labels_missing:
        print(f1_score(y_test,y_pred))
        return None

    if review_ids is None:
        review_ids = test.review_id  # fallback to the notebook-level test frame
    sub = pd.DataFrame()
    sub['review_id'] = review_ids
    sub['user_suggestion'] = y_pred
    return sub

In [15]:
def local_test_count_tfidf(model,corpus,X_train,y_train,X_test,y_test="Not passed",review_ids=None):
    """Fit ``model`` on TF-IDF weighted uni+bigrams and score it or build a submission.

    The ``CountVectorizer`` vocabulary and the ``TfidfTransformer`` weights
    are both learned from ``corpus``; ``X_train`` and ``X_test`` are then
    transformed with them.

    Parameters
    ----------
    model : estimator with ``fit`` and ``predict``
    corpus : iterable of str
        Texts used to build the vocabulary and the IDF weights.
    X_train, X_test : iterable of str
        Raw texts to fit on / predict for.
    y_train : array-like
        Labels for ``X_train``.
    y_test : array-like, optional
        Labels for ``X_test``.  When omitted (the ``"Not passed"`` sentinel)
        or ``None``, a submission frame is returned instead of a score.
    review_ids : array-like, optional
        Ids for the submission frame.  Defaults to the notebook-global
        ``test.review_id`` to stay compatible with existing calls.

    Returns
    -------
    pandas.DataFrame or None
        ``None`` after printing the F1 score when ``y_test`` is given;
        otherwise a frame with ``review_id`` and ``user_suggestion``.
    """
    cv = CountVectorizer(ngram_range=(1,2))
    tf = TfidfTransformer()
    tf.fit(cv.fit_transform(corpus))
    X_train_cv = tf.transform(cv.transform(X_train))
    X_test_cv = tf.transform(cv.transform(X_test))
    model.fit(X_train_cv,y_train)
    y_pred = model.predict(X_test_cv)

    # Test the sentinel by type first: comparing a numpy array with a string
    # is an elementwise comparison, which either warns or has no single
    # truth value depending on the numpy version.
    labels_missing = y_test is None or (isinstance(y_test,str) and y_test == "Not passed")
    if not labels_missing:
        print(f1_score(y_test,y_pred))
        return None

    if review_ids is None:
        review_ids = test.review_id  # fallback to the notebook-level test frame
    sub = pd.DataFrame()
    sub['review_id'] = review_ids
    sub['user_suggestion'] = y_pred
    return sub

In [None]:
# Build the vocabulary from train + test text, then predict the test reviews.
# `corpus` is derived from `train_corpus` (the name the training helper's
# result is bound to) rather than from itself, so the cell works on a fresh
# kernel and re-running it does not append the test corpus a second time.
svm_final = SVC(kernel='linear',C=0.01)
corpus = np.concatenate((train_corpus,test_corpus))
sub = local_test_count(svm_final,corpus,X_train,y_train,X_test_sub)

In [16]:
# Validate a linear SVM (C=0.01) on TF-IDF features; the helper prints the
# F1 score for the hold-out split.
svm_final = SVC(C=0.01, kernel='linear')
local_test_count_tfidf(
    model=svm_final,
    corpus=train_corpus,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
)

0.7258476121035362


  if sys.path[0] == '':


In [None]:
# Write the submission frame to disk without the index column.
sub.to_csv(path_or_buf="svm_linear_0.01.csv", index=False)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear')
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear',C=0.1)
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear',C=10)
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear',C=100)
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear',C=0.01)
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# `local_test` is not defined anywhere in this notebook, so this cell raised
# NameError.  NOTE(review): assumes the count-based helper was intended;
# use local_test_count_tfidf instead for TF-IDF features.
svm_linear = SVC(kernel='linear',C=0.001)
local_test_count(svm_linear,train_corpus,X_train,y_train,X_test,y_test)

In [None]:
# cv = CountVectorizer(ngram_range=(1,2))
# full_corpus = cv.fit_transform(train_corpus)
# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test)
# lr = LogisticRegression(max_iter=800)
# lr.fit(X_train_cv,y_train)
# y_pred = lr.predict(X_test_cv)
# f1_score(y_test,y_pred)

In [None]:
# cv = CountVectorizer(ngram_range=(1,2))
# full_corpus = cv.fit_transform(train_corpus)
# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test)
# lr = MultinomialNB()
# lr.fit(X_train_cv,y_train)
# y_pred = lr.predict(X_test_cv)
# f1_score(y_test,y_pred)

In [None]:
# cv = CountVectorizer(ngram_range=(1,2))
# #corpus = np.concatenate((corpus,test_corpus))
# full_corpus = cv.fit_transform(train_corpus)
# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test)

In [None]:
# tf = TfidfTransformer()
# tf.fit(full_corpus)
# X_train_cv = tf.transform(X_train_cv)
# X_test_cv = tf.transform(X_test_cv)
# lr = LogisticRegression(max_iter=800)
# lr.fit(X_train_cv,y_train)
# y_pred = lr.predict(X_test_cv)
# f1_score(y_test,y_pred)

In [None]:
# cv = CountVectorizer(ngram_range=(1,2))
# corpus = np.concatenate((corpus,test_corpus))
# full_corpus = cv.fit_transform(corpus)
# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test_sub)
# lr = LogisticRegression(max_iter=800)
# lr.fit(X_train_cv,y_train)
# y_pred = lr.predict(X_test_cv)
# #Highest submission 85 with logistic regression

In [None]:
# cv = CountVectorizer(ngram_range=(1,2))
# #corpus = np.concatenate((corpus,test_corpus))
# full_corpus = cv.fit_transform(train_corpus)
# X_train_cv = cv.transform(X_train)
# X_test_cv = cv.transform(X_test)
# xgb = XGBClassifier()
# xgb.fit(X_train_cv,y_train)
# y_pred = xgb.predict(X_test_cv)
# f1_score(y_test,y_pred)

In [None]:
# Assemble a submission frame by hand from the test ids and `y_pred`.
# NOTE(review): `y_pred` is only assigned inside the helper functions and in
# commented-out cells, so on a fresh kernel this cell raises NameError unless
# one of those cells is restored - confirm which predictions are meant.
# This also overwrites any `sub` produced by an earlier cell.
sub = pd.DataFrame()
sub['review_id'] = test.review_id
sub['user_suggestion'] = y_pred

In [None]:
# Write the submission frame to disk without the index column.
sub.to_csv(path_or_buf="submission_tfidf_stop.csv", index=False)

In [None]:
# Sanity check: number of predictions.
# NOTE(review): relies on a notebook-level `y_pred`, which no active cell
# defines - confirm where it is expected to come from.
len(y_pred)