In [None]:
pip install -U sentence-transformers

Collecting sentence-transformers
[?25l  Downloading https://files.pythonhosted.org/packages/78/e0/65ad8fd86eba720412d9ff102c4a3540a113bbc6cd29b01e7ecc33ebb1fa/sentence-transformers-0.3.2.tar.gz (65kB)
[K     |████████████████████████████████| 71kB 4.9MB/s 
[?25hCollecting transformers>=3.0.2
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 16.7MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 14.6MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-ma

In [None]:
#Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report
from pprint import pprint
from sklearn.model_selection import GridSearchCV
from typing import List
import pickle

In [None]:
# Import dataset: read the labelled sentiment CSV into a DataFrame.
# Later cells read the "text" and "label" columns from it.
df = pd.read_csv("labelled_sentiments.csv")

In [None]:
# Hold out 20% of the rows for testing; stratify on the label column so both
# splits keep the same class proportions, and seed the split for repeatability.
train, test = train_test_split(
    df,
    test_size=0.2,
    stratify=df["label"],
    random_state=1988,
)

In [None]:
# Embed the training texts with a pretrained sentence encoder.
# The labels are taken straight from the training split.
y_train = train["label"]

sentencemodel = SentenceTransformer('roberta-base-nli-mean-tokens')
X_train = sentencemodel.encode(train["text"].tolist())
print("finished encoding X_train inputs")

100%|██████████| 459M/459M [00:09<00:00, 48.7MB/s]


finished encoding X_train inputs


In [None]:
# Train Classifier
# Fit a random forest on the sentence embeddings. The forest is seeded with the
# same value as the train/test split so repeated runs build the same trees.
rfclf = RandomForestClassifier(verbose=True, n_jobs=-1, random_state=1988)
rfclf.fit(X_train, y_train)
print(f"training accuracy: {rfclf.score(X_train, y_train)}")

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   58.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.6s


training accuracy: 0.9937175777482339


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    1.3s finished


In [None]:
# Score the fitted forest on the held-out split: embed the test texts with the
# same encoder used for training, then report mean accuracy.
y_test = test["label"]
X_test = sentencemodel.encode(test["text"].tolist())
print(f"Test accuracy: {rfclf.score(X_test, y_test)}")

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s


Test accuracy: 0.9607965801272026


[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.3s finished


In [None]:
# Per-class precision / recall / F1 of the baseline forest on the test split
preds = rfclf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s


              precision    recall  f1-score   support

          -1       0.90      0.84      0.87       733
           0       0.96      0.98      0.97      6677
           1       0.98      0.93      0.96      2181

    accuracy                           0.96      9591
   macro avg       0.95      0.92      0.93      9591
weighted avg       0.96      0.96      0.96      9591



[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.3s finished


In [None]:
# Show the hyperparameters the baseline forest was built with
print("Parameters currently in use:\n")
pprint(rfclf.get_params())

Parameters currently in use:

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'oob_score': False,
 'random_state': None,
 'verbose': True,
 'warm_start': False}


In [None]:
# Grid search over split criterion and minimum leaf size (2 x 2 candidates),
# scored with 3-fold cross-validation on the training embeddings.
parameters = {
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2],
}
grid = GridSearchCV(estimator=rfclf, param_grid=parameters,
                    cv=3, n_jobs=-1, verbose=True)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed: 16.8min finished
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=-1,
                                              oob_score=False,
                                              random_

In [None]:
#Best Parameters: the parameter combination the grid search ranked first
grid.best_params_

{'criterion': 'entropy', 'min_samples_leaf': 2}

In [None]:
#Gridsearch Results: per-candidate fit/score times and cross-validation scores as a table
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_min_samples_leaf,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,164.22335,3.102275,0.871905,0.094949,gini,1,"{'criterion': 'gini', 'min_samples_leaf': 1}",0.921717,0.913819,0.913037,0.916191,0.003921,2
1,147.503776,1.328198,0.79255,0.080288,gini,2,"{'criterion': 'gini', 'min_samples_leaf': 2}",0.917729,0.90983,0.912255,0.913271,0.003304,4
2,174.177743,1.842895,0.698964,0.004772,entropy,1,"{'criterion': 'entropy', 'min_samples_leaf': 1}",0.920231,0.913897,0.914444,0.916191,0.002866,2
3,168.834847,2.157618,0.582345,0.20324,entropy,2,"{'criterion': 'entropy', 'min_samples_leaf': 2}",0.920701,0.913584,0.914757,0.916347,0.003115,1


In [None]:
# Keep a handle on the estimator the grid search selected as best
bestmodel=grid.best_estimator_

In [None]:
# Per-class precision / recall / F1 of the grid-search winner on the test split
preds = bestmodel.predict(X_test)
print(classification_report(y_true=y_test, y_pred=preds))

[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.1s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.2s finished


              precision    recall  f1-score   support

          -1       0.89      0.84      0.86       733
           0       0.96      0.98      0.97      6677
           1       0.98      0.93      0.95      2181

    accuracy                           0.96      9591
   macro avg       0.94      0.92      0.93      9591
weighted avg       0.96      0.96      0.96      9591



In [None]:
#Add model predictions to test data file to view results
# `test` came out of train_test_split as a slice of `df`, so assigning a column
# in place triggers pandas' SettingWithCopyWarning. assign() returns a new frame
# with the extra column instead of writing into the slice.
test = test.assign(modelresults=preds)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Write the test split, including the modelresults column, to CSV for inspection
test.to_csv("modelresults.csv")

In [None]:
# Save model
# The context manager closes the file even if pickling fails.
# NOTE(review): this persists `rfclf` (the baseline forest), not `bestmodel`
# from the grid search - confirm that is the intended artifact.
with open("sentiment_model.pkl", "wb") as model_file:
    pickle.dump(rfclf, model_file)

In [None]:
# Productionizing the model: raw inputs for the classifier wrapper.
# This rebinds X_train / y_train to plain Python lists of texts and labels
# (they previously held the embeddings and the label Series).
y_train = train["label"].tolist()
X_train = train["text"].tolist()

In [None]:
#Productionizing the Model: Create class
class SentimentClassifier:
  """Sentiment classifier: sentence embeddings fed into a random forest.

  Texts are embedded with a pretrained SentenceTransformer and classified with
  a RandomForestClassifier. save() pickles only the fitted forest; the encoder
  is re-created on demand from ENCODER_NAME, so a loaded classifier can predict
  without being retrained.
  """

  # Name of the pretrained SentenceTransformer used to embed texts.
  ENCODER_NAME = 'roberta-base-nli-mean-tokens'

  def __init__(self, model_path: str = None,
               train_mode = True):
    """Create a classifier, optionally restoring a saved model.

    Args:
      model_path: path of a pickle written by save(); when given, the model
        is loaded immediately.
      train_mode: initial value of the train_mode flag. Loading a model
        always switches the flag to False.
    """
    self.model = None
    self.encoder = None
    # Assign the flag before load(): load() sets train_mode to False and that
    # value must not be overwritten by the constructor argument afterwards.
    self.train_mode = train_mode
    if model_path:
      self.load(model_path)

  def _get_encoder(self):
    """Return the sentence encoder, creating it on first use."""
    if self.encoder is None:
      self.encoder = SentenceTransformer(self.ENCODER_NAME)
    return self.encoder

  def load(self, model_path: str) -> bool:
    """Restore the fitted model from a pickle written by save().

    Sets train_mode to False and returns True.
    """
    print("loading :", model_path)
    # NOTE: unpickling can execute arbitrary code - only load trusted files.
    with open(model_path, "rb") as model_file:
      loaded_model = pickle.load(model_file)
    self.model = loaded_model["model"]
    self.train_mode = False
    return True

  def save(self, model_path: str, model_name: str) -> str:
    """Pickle the fitted model to model_path + model_name + ".pkl".

    model_path is used as a plain string prefix, so a directory must end with
    a path separator. Returns the filename that was written.
    """
    model_properties = {"model": self.model}
    filename = model_path+model_name+".pkl"
    with open(filename, "wb") as model_file:
      pickle.dump(model_properties, model_file)
    return filename

  def train(self, X: List[str], y: List[int] ) -> float:
    """ Take in X,y. Fit model to data and return
         the model's train accuracy """
    # 1. Convert the raw texts into sentence-embedding features
    X_train = self._get_encoder().encode(X)

    # 2. Load and fit model to X,y
    # NOTE(review): the grid search in this notebook reported
    # min_samples_leaf=2 as best; 1 is kept here as originally written.
    self.model  = RandomForestClassifier(verbose=True, n_jobs=-1, criterion= 'entropy', min_samples_leaf=1)
    self.model.fit(X_train, y)

    # 3. Calculate train accuracy
    train_acc = self.model.score(X_train, y)
    return train_acc

  def predict(self, inputs: List[str]) -> List[int]:
    """ Take in a list of string inputs and output
        the model's predictions, one label per input.

    The value is returned exactly as produced by the underlying model's
    predict().

    Raises:
      RuntimeError: if no model has been trained or loaded yet.
    """
    if self.model is None:
      raise RuntimeError("No model available: call train() or load() before predict().")

    # 1. Convert inputs into features (the encoder is created lazily, so this
    #    also works on a classifier restored with load())
    input_feats = self._get_encoder().encode(inputs)

    # 2. Run model on features and get predictions
    preds = self.model.predict(input_feats)
    return preds

In [None]:
# Build a fresh classifier and fit it on the raw training texts and labels;
# the returned training accuracy is displayed as the cell output.
model = SentimentClassifier()
model.train(X=X_train, y=y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:  2.1min finished
[Parallel(n_jobs=2)]: Using backend ThreadingBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:    0.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:    0.9s finished


0.9943171450170747

In [None]:
# Persist the trained classifier; save() writes "./saved_model.pkl" and returns that filename
model.save("./", "saved_model")

In [None]:
# Reload the saved classifier. The class defined in this notebook is
# SentimentClassifier; there is no SentimentModel.
new_model = SentimentClassifier("saved_model.pkl")