**Fetch the data**

In [1]:
import pandas as pd 

# Load the dataset into a DataFrame; the relative path means "Tweets.csv"
# is looked up in the notebook's current working directory.
tweets = pd.read_csv("Tweets.csv")

**Divide the dataset into features and labels sets**

In [2]:
# 0-based positional column indices into `tweets`.
FEATURE_COL = 10  # presumably the raw tweet text -- TODO confirm against the CSV header
LABEL_COL = 1     # the class label that the classifier is trained to predict

# Pull both columns out as plain arrays for the preprocessing/modelling steps.
features = tweets.iloc[:, FEATURE_COL].values
labels = tweets.iloc[:, LABEL_COL].values

**Preprocess the features**
* Replace special characters with a space
* Replace all single characters with a space
* Replace a single character at the start with a space
* Substitute multiple spaces with a single space
* Remove prefixed 'b' present for bytes format strings
* Convert to lower case

In [3]:
import re

def _preprocess_text(raw):
    """Normalise one raw feature value into lower-case, space-separated text.

    Args:
        raw: A single entry of ``features``; it is passed through ``str()``
            first, so non-string values are cleaned as their string form.

    Returns:
        The cleaned, lower-cased string.
    """
    # Every non-word character (punctuation, symbols, ...) becomes a space.
    text = re.sub(r'\W', ' ', str(raw))
    # Drop single letters that stand alone between whitespace.
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
    # Drop a single letter at the very start of the string.
    # Fix: the pattern used to be r'\^[a-zA-Z]\s+', where the escaped caret
    # matches a literal '^' character instead of anchoring at the start.
    # Because all '^' characters were already replaced above, it never matched.
    text = re.sub(r'^[a-zA-Z]\s+', ' ', text)
    # Collapse runs of whitespace (re.I was dropped: it has no effect on \s+).
    text = re.sub(r'\s+', ' ', text)
    # Strip a leading 'b' left over from the str() form of a bytes literal.
    text = re.sub(r'^b\s+', '', text)
    return text.lower()


# Clean every raw feature, preserving the original order.
features_pp = [_preprocess_text(raw_feature) for raw_feature in features]

**Vectorize the preprocessed features and remove stopwords**
* Use 2500 most frequently occurring words to create the bag of words (max_features)
* Include words that occur in at least 7 documents (min_df)
* Use those words that occur in a maximum of 80% of the documents (max_df)
* Remove stopwords

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Configure the TF-IDF vectoriser: cap the vocabulary at 2500 terms, keep
# only terms seen in at least 7 documents and in at most 80% of them, and
# filter out NLTK's English stopwords.
english_stopwords = stopwords.words('english')
vectorizer = TfidfVectorizer(
    max_features=2500,
    min_df=7,
    max_df=0.8,
    stop_words=english_stopwords,
)

# NOTE(review): this rebinds `features_pp` from the list of cleaned strings
# to a dense numeric array, so re-running this cell on its own requires
# re-running the preprocessing cell first.
tfidf_matrix = vectorizer.fit_transform(features_pp)
features_pp = tfidf_matrix.toarray()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rchattopadhyay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


**Split the data into Training and Test sets**
* Test data size is 0.2 i.e. 20% of the data
* Train data size is the remaining 80%.

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features_pp,
    labels,
    test_size=0.2,
    random_state=0,
)

**Train the model**

In [6]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 200 trees, seeded so that training is repeatable.
classifier_rf = RandomForestClassifier(
    n_estimators=200,
    random_state=0,
)
# Left as the bare last expression so the fitted estimator is shown as the
# cell's output.
classifier_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

**Make predictions**

In [7]:
# Predict a class label for every held-out test row; these predictions are
# compared against y_test in the evaluation cell.
predictions = classifier_rf.predict(X_test)

**Evaluate the model**
* Confusion matrix
* Classification report
* Accuracy score

In [8]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Each report is printed as a header line followed by its value; sep="\n"
# puts the value on the line after the header.
print("\nConfusion Matrix", confusion_matrix(y_test, predictions), sep="\n")

print("\nClassification Report", classification_report(y_test, predictions), sep="\n")

# accuracy_score returns a fraction; scale it to a percentage for display.
print("\nAccuracy")
print(100 * accuracy_score(y_test, predictions), '%')


Confusion Matrix
[[1723  108   39]
 [ 326  248   40]
 [ 132   58  254]]

Classification Report
              precision    recall  f1-score   support

    negative       0.79      0.92      0.85      1870
     neutral       0.60      0.40      0.48       614
    positive       0.76      0.57      0.65       444

    accuracy                           0.76      2928
   macro avg       0.72      0.63      0.66      2928
weighted avg       0.75      0.76      0.74      2928


Accuracy
75.99043715846994 %


**Save the model**

In [9]:
import pickle

# Persist the trained classifier to disk. The `with` block guarantees the
# file is flushed and closed even if pickling fails; previously the handle
# returned by open() was never closed explicitly.
# NOTE(review): only the classifier is saved here. The fitted `vectorizer`
# is not, so new raw text cannot be scored from this file alone.
with open('classifier_rf.sav', 'wb') as model_out:
    pickle.dump(classifier_rf, model_out)

**Load the model**

In [10]:
modelFile = 'classifier_rf.sav'

# Reload the pickled classifier. The `with` block closes the file handle,
# which the previous bare open() call leaked.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load model files that you created yourself or otherwise trust.
with open(modelFile, 'rb') as model_in:
    model = pickle.load(model_in)

# Sanity check: the reloaded model should reproduce the test accuracy that
# the in-memory classifier achieved.
print('Model loaded:', model.score(X_test, y_test)*100, '% accuracy')

Model loaded: 75.99043715846994 % accuracy
