### Packages import

In [1]:
# Import base packages
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Import ML packages
from sklearn import preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

### Read train/test datasets
Create a function to read train and test datasets with follow actions:
- Have a look at readme.txt to get more information about the datasets
- Columns should be renamed to 'rate' and 'text'
- Take a random sample of 5000 records for training and test datasets
- Positive labels should be mapped to 0 (instead of 2 in the initial dataset)

In [8]:
def read_format_dataset(dataset_path):
    fulldata = pd.read_csv(dataset_path, header=0, names=['rate', 'text'])
    
    return fulldata.sample(5000)

train_dataset_path = './Data/train.csv'
test_dataset_path = './Data/test.csv'
train_data = read_format_dataset(train_dataset_path)
test_data = read_format_dataset(test_dataset_path)
train_data.head()

Unnamed: 0,rate,text
131923,2,"So, up until tonight, I've primarily used Yelp..."
452109,1,This Pita Jungle had some blemishes... the pit...
549634,1,The only reason this place is worthy of one st...
491197,2,Good bar. Good location. The service was quic...
321865,2,I am remiss in not writing my review of our di...


In [9]:
print(train_data.shape)
print(test_data.shape)

(5000, 2)
(5000, 2)


### Training pipeline
Our pipeline will use:
- TFIDF to vectorize our text
- RandomForest on top of these features

You should first build this sklearn pipeline and use RandomizedSearchCV to get the best parameters for your pipeline. As our training dataset is small, you would probably increase the number of cross validations. Use accuracy as target metric

In [10]:
RANDOM_SEED = 15

In [20]:
# Pipeline definition
tfidf = TfidfVectorizer(
    stop_words=None,
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
    norm='l2',
    sublinear_tf=False
)
rfc = RandomForestClassifier(
    n_estimators=100,
    criterion='gini',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features='auto',
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    min_impurity_split=None,
    bootstrap=True,
    oob_score=False,
    n_jobs=-1,
    random_state=RANDOM_SEED,
    verbose=0,
    warm_start=False,
    class_weight=None,
    ccp_alpha=0.0,
    max_samples=None
)

pipe = Pipeline([
    ('traitement', tfidf),
    ('modele', rfc)
])

In [23]:
# Create the random grid
random_grid = {
    'traitement__stop_words': [None, 'english'],
    #'traitement__norm': ['l1', 'l2'],
    #'traitement__max_features': [None, 100],
    #'traitement__sublinear_tf': [False, True],
    'modele__n_estimators': [50, 100, 200]
}

print(random_grid)

{'traitement__stop_words': [None, 'english'], 'modele__n_estimators': [50, 100, 200]}


In [24]:
# Definition of the random search
random_search = RandomizedSearchCV(
    pipe,
    param_distributions=random_grid,
    n_iter=5,
    n_jobs=-1,
    cv=None,
    verbose=1,
    random_state=RANDOM_SEED
)

# Fit the random search model
random_search.fit(train_data.loc[:, 'text'], train_data.loc[:, 'rate'])

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   34.2s finished


RandomizedSearchCV(cv=None, error_score=nan,
                   estimator=Pipeline(memory=None,
                                      steps=[('traitement',
                                              TfidfVectorizer(analyzer='word',
                                                              binary=False,
                                                              decode_error='strict',
                                                              dtype=<class 'numpy.float64'>,
                                                              encoding='utf-8',
                                                              input='content',
                                                              lowercase=True,
                                                              max_df=1.0,
                                                              max_features=None,
                                                              min_df=1,
                                               

### Performance on test dataset
Compute the accuracy for our test dataset

In [31]:
best_rfc = random_search.best_params_
print(best_rfc)
y_pred = random_search.predict(test_data.loc[:, 'text'])
acc = accuracy_score(test_data.loc[:, 'rate'], y_pred)
print(acc)

{'traitement__stop_words': 'english', 'modele__n_estimators': 200}
0.861
