In [None]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the current Colab user, then build a PyDrive client that reuses
# the application-default credentials produced by that login.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the handle used by the next cell to fetch the dataset file.
drive = GoogleDrive(gauth)

In [None]:
# Download the review dataset from Google Drive into the current working
# directory as 'bumble_hinge_review.csv'.
# NOTE(review): `id` and `file` shadow Python builtins for the rest of the
# kernel session; consider renaming once it is confirmed no other cell reads them.
id = '1AQ6VrAnE1xGQRM_OPPgA9dsxXlPSC-SW'
file = drive.CreateFile({'id':id}) 
file.GetContentFile('bumble_hinge_review.csv')

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from tqdm import tqdm

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from scipy.sparse import hstack

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk import punkt
# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the 'punkt' tokenizer models.
nltk.download('stopwords')
nltk.download('punkt')
# English stop words as returned by NLTK's stopwords corpus.
stop_words = stopwords.words('english')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


# Loading in Dataset

In [None]:
import os

# The original path only exists when Google Drive has been mounted at
# /content/drive, which this notebook never does, so the cell fails on a fresh
# kernel. Prefer the mounted path when it is available (unchanged behaviour),
# otherwise fall back to the copy that the PyDrive cell above writes to the
# working directory.
# NOTE(review): presumably both locations hold the same file — confirm.
DRIVE_CSV_PATH = '/content/drive/MyDrive/DSO 560 NLP Team Project/clean_data/bumble_hinge_review.csv'
LOCAL_CSV_PATH = 'bumble_hinge_review.csv'
csv_path = DRIVE_CSV_PATH if os.path.exists(DRIVE_CSV_PATH) else LOCAL_CSV_PATH

df=pd.read_csv(csv_path, index_col=False)
df.head()

Unnamed: 0,Name,Review,Rating,#ThumbsUp,Date&Time,App,language
0,Khy McCabe,"can not seem to log into account , i have try ...",1,0,18-02-2022 00:53,Bumble,en
1,rob cif,limited view without pay money . 90 % scam acc...,1,1,18-02-2022 00:53,Bumble,en
2,abhishek bhatia,this be the most trusted but bad app . this ap...,1,0,18-02-2022 00:25,Bumble,en
3,Myles Grothaus,just a money grab . i use to love this app bef...,1,0,17-02-2022 23:55,Bumble,en
4,David Barak,the app be `` crapp . '' if i switch away from...,1,0,17-02-2022 23:55,Bumble,en


In [None]:
# Binary "helpfulness" label for Bumble reviews only:
#   0 -> at most one thumbs-up, 1 -> more than one thumbs-up.
# Rows from other apps are left without a label and are excluded from both
# class frames below.
is_bumble = df.App == 'Bumble'
thumbs_up = df['#ThumbsUp']

df.loc[is_bumble & (thumbs_up <= 1), 'label'] = 0
df.loc[is_bumble & (thumbs_up > 1), 'label'] = 1

# One frame per class, used by the downsampling step.
df_1 = df.loc[is_bumble & (df['label'] == 1)]
df_0 = df.loc[is_bumble & (df['label'] == 0)]

In [None]:
# Size of the negative class (label 0) before downsampling.
df_0.shape

(63635, 8)

In [None]:
# Size of the positive class (label 1); this is the smaller class.
df_1.shape

(15258, 8)

In [None]:
# downsampling
from random import sample
import random

random.seed(0)
# Balance the classes by drawing as many label-0 rows as there are label-1
# rows. The target size is derived from df_1 instead of being hard-coded, so
# the cell stays correct if the dataset or the labelling threshold changes.
# Reproducibility of the draw comes from `random_state`, not from random.seed.
n_positive = len(df_1)
df_0 = df_0.sample(n=n_positive, random_state=0)
df = pd.concat([df_1, df_0])
df.shape


In [None]:
# The concatenated frame keeps the row labels of the original data; renumber
# them 0..n-1 so that later cells can address reviews by position (doc[i]).
df=df.reset_index(drop=True)

In [None]:
# Target vector: 1 for reviews with more than one thumbs-up, 0 otherwise.
y=df['label']

In [None]:
# Review text of the balanced sample; this is the corpus fed to the embedding
# model in the cells below.
doc=df['Review']

# Word2Vec Embedding

## Gensim Doc2Vec Embedding

In [None]:
%time
# train embedding model
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(doc)]
model = Doc2Vec(documents,vector_size=100, window=4, min_count=1, workers=4)
# model.build_vocab(documents)
model.train(documents,total_examples=model.corpus_count,epochs=20)

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 9.3 µs


In [None]:
%%time
# Document Embedding
x=np.empty(shape=[0,100])
for i in range(len(doc)):
    doc_vector=model.infer_vector(doc[i]).reshape(1, -1)
    x=np.vstack((x,doc_vector))
x=pd.DataFrame(x)


CPU times: user 3min 16s, sys: 5.42 s, total: 3min 22s
Wall time: 3min 21s


# Split Into Train/Test

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing, keeping the class ratio identical in
# both parts (stratify). A fixed random_state makes the split, and therefore
# every metric reported below, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=0
)

# Model Tuning

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
def tunning_model(X_train, X_test, y_train, y_test,model):
    """Fit ``model`` on the training split and print evaluation metrics.

    Prints, in order: train accuracy, test accuracy, the distribution of
    predicted test labels, the distribution of true test labels, the test
    confusion matrix and the test ROC AUC.

    Parameters
    ----------
    X_train, X_test : feature matrices for the train and test splits.
    y_train, y_test : true labels for the train and test splits.
    model : estimator exposing ``fit`` and ``predict`` plus either
        ``predict_proba`` or ``decision_function``. It is fitted in place.

    Returns
    -------
    None. All results are written to standard output.
    """
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    train_score = accuracy_score(y_train, y_train_pred)
    test_score = accuracy_score(y_test, y_test_pred)

    # ROC AUC needs a continuous score for the positive class. Not every
    # classifier offers predict_proba (e.g. SVC with its default settings),
    # so fall back to decision_function instead of crashing after the fit.
    if hasattr(model, "predict_proba"):
        y_test_score = model.predict_proba(X_test)[:, 1]
    else:
        y_test_score = model.decision_function(X_test)

    print('train: ',train_score)
    print('test: ',test_score)
    print("y_test_pred:\n",pd.Series(y_test_pred).value_counts())
    print("y_test:\n",pd.Series(y_test).value_counts())
    print('confusion matrix: ',confusion_matrix(y_test, y_test_pred))
    print( "roc acu: ",roc_auc_score(y_test, y_test_score))

In [None]:
%%time
# Logistic regression baseline. With the default iteration budget the solver
# stops before converging on these features (see the "ITERATIONS REACHED
# LIMIT" warning in the recorded output), so allow more iterations.
lr = LogisticRegression(max_iter=1000)
tunning_model( X_train, X_test, y_train, y_test,lr)

train:  0.6013436015074554
test:  0.6002621231979031
y_test_pred:
 0.0    3278
1.0    2826
dtype: int64
y_test:
 0.0    3052
1.0    3052
Name: label, dtype: int64
confusion matrix:  [[1945 1107]
 [1333 1719]]
roc acu:  0.6272843452674395
CPU times: user 1.6 s, sys: 729 ms, total: 2.33 s
Wall time: 1.85 s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [None]:
%%time
# Logistic regression (a classifier, not linear regression).
# max_iter is raised because the default iteration budget is not enough for
# the solver to converge on these features.
lr = LogisticRegression(max_iter=1000)
tunning_model( X_train, X_test, y_train, y_test,lr)

In [None]:
%%time
# Random Forest
# Shallow trees with large leaves limit overfitting; n_jobs=-1 uses all cores.
# random_state pins the bootstrap samples and feature draws so the reported
# metrics can be reproduced.
params={'n_estimators':100, 'max_depth':10,'min_samples_leaf':30,"n_jobs":-1, 'random_state':0}
rf = RandomForestClassifier(**params)
tunning_model( X_train, X_test, y_train, y_test,rf)

train:  0.7436097001474684
test:  0.6915137614678899
y_test_pred:
 1.0    3213
0.0    2891
dtype: int64
y_test:
 0.0    3052
1.0    3052
Name: label, dtype: int64
confusion matrix:  [[2030 1022]
 [ 861 2191]]
roc acu:  0.7525491953367494
CPU times: user 26.9 s, sys: 0 ns, total: 26.9 s
Wall time: 14.2 s


In [None]:
# Boosted tree
# random_state pins the estimator's internal randomness so that repeated runs
# report the same metrics.
# NOTE(review): max_depth=20 gives very deep trees; the recorded train/test
# gap (0.94 vs 0.68) suggests overfitting — consider a smaller depth.
params={'n_estimators': 50, 'max_depth': 20, 'min_samples_split': 50, 'learning_rate': 0.01, 'min_samples_leaf': 1, 'random_state': 0}
bt = GradientBoostingClassifier(**params)
tunning_model( X_train, X_test, y_train, y_test,bt)

train:  0.9412993609700148
test:  0.6805373525557011
y_test_pred:
 1.0    3166
0.0    2938
dtype: int64
y_test:
 0.0    3052
1.0    3052
Name: label, dtype: int64
confusion matrix:  [[2020 1032]
 [ 918 2134]]
roc acu:  0.7296425093057171
