# General imports.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from text_processing import cleaning_string, cleaning_data

pd.options.display.max_colwidth = 200

# Data.

In [2]:
df = pd.read_csv("data/final_sub.csv")
#df = pd.read_csv("data/final_full.csv")
df.shape

(5000, 3)

In [3]:
df_train = pd.read_csv('data/final_train_sub.csv')
#df_train = pd.read_csv('data/final_train_full.csv')
print(df_train.shape)

df_test = pd.read_csv('data/final_test_sub.csv')
#df_test = pd.read_csv('data/final_test_full.csv')
print(df_test.shape)

(4000, 3)
(1000, 3)


In [4]:
df_train.groupby('label').count()

Unnamed: 0_level_0,Unnamed: 0,reviewText
label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,1914,1914
1,2086,2086


In [5]:
print(df_train.shape[0] + df_test.shape[0])

5000


In [6]:
df1_train = cleaning_data(df_train, 'reviewText')
df1_test = cleaning_data(df_test, 'reviewText')
df1_train.head()

Unnamed: 0.1,Unnamed: 0,label,reviewText,clean_text
0,4227,1,"Wow, just finished reading this book it was different and interesting but at the same time passionable and great love.",wow just finish read book differ interest time passion great love
1,4676,0,How many stereotypes can be crammed into 1 short story. I got this since it appeared to be a modern continuation of &#34;To Ride a Puka&#34; I highly recommend that book. But don't bother with thi...,mani stereotyp can cram short stori got sinc appear modern continu ride puka high recommend book don t bother one
2,800,1,OMG. I can't get enough of this series. Just love Taron and Ivy. I love the Walker brothers. Can't wait to see whats happens next.,omg can t get enough seri just love taron ivi love walker brother can t wait see what happen next
3,3671,1,Wow! You will need a tissue when you read this. Poor Alex has so many things happen to her in her 30 years. The bulk of the story takes place while she is 15. Fast forward to present day with a se...,wow will need tissu read poor alex mani thing happen year bulk stori take place fast forward present day second chanc love one deserv happi much alex sweet read romanc hot scene book work
4,4193,1,"Set in LA, there are hints of LA Confidential along with a quirky heroine and a solid mystery. ""Kenny"" Rubin (the name is short for a name she hates) is a woman in a tough town. She also writes fo...",set la hint la confidenti along quirki heroin solid mysteri kenni rubin name short name hate woman tough town also write magazin s fold s cover five year old mysteri troubl crop secret stay buri s...


In [7]:
X_train = df1_train['clean_text']
X_test = df1_test['clean_text']

y_train = df1_train['label']
y_test = df1_test['label']

# Features. Part 1: Train

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, accuracy_score

  from numpy.core.umath_tests import inner1d


In [9]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=2, use_idf=True)
train_tfidf = tfidf_vectorizer.fit_transform(X_train.tolist())
test_tfidf = tfidf_vectorizer.transform(X_test.tolist())

In [10]:
#train_tfidf

In [11]:
model = GradientBoostingClassifier(n_estimators=100, random_state=42, learning_rate=0.1)
model.fit(train_tfidf, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=42, subsample=1.0, verbose=0,
              warm_start=False)

In [12]:
print(confusion_matrix(y_train, model.predict(train_tfidf)))

[[1775  139]
 [ 207 1879]]


In [13]:
print(confusion_matrix(y_test, model.predict(test_tfidf)))
print(classification_report(y_test, model.predict(test_tfidf)))
print(accuracy_score(y_test, model.predict(test_tfidf)))

[[411  67]
 [ 85 437]]
             precision    recall  f1-score   support

          0       0.83      0.86      0.84       478
          1       0.87      0.84      0.85       522

avg / total       0.85      0.85      0.85      1000

0.848


# Diagnosis

In [14]:
df1_test['predict_label'] =  model.predict(test_tfidf)
df1_test['predict'] =  model.predict_proba(test_tfidf)
df1_test.head()

ValueError: Wrong number of items passed 2, placement implies 1

In [None]:
df1_test['error'] = abs(df1_test['label'] - df1_test['predict_label'])

In [None]:
df_temp = df1_test[['reviewText','clean_text','label', 'predict_label', 'predict']][(df1_test['error'] == 1) & (df1_test['predict_label'] == 0)].sort_values(by='predict', ascending = False).head()

In [None]:
df_temp.reviewText.iloc[0]

In [None]:
df_temp.clean_text.iloc[0]

### Very right 

In [None]:
df_temp = df1_test[['reviewText','clean_text','label', 'predict_label', 'predict']][(df1_test['error'] == 0) & (df1_test['predict_label'] == 1)].sort_values(by='predict', ascending = True)
df_temp.head()

In [None]:
df_temp['reviewText'].iloc[0]

df_final = X_test.to_frame()
df_final['label'] = y_test
df_final['predict'] =  model.predict(test_tfidf)
df_final.head()

In [None]:
#df_final.to_csv('data/ml_results.csv')

In [None]:
df1_test[['reviewText','label', 'predict_label']].to_csv('data/final_ml_results.csv')