<a href="https://colab.research.google.com/github/jpkrajewski/Instagram-Follower-Bot/blob/main/NLP_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natural Language Processing

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the sentiment corpus and pull two columns out by position:
# column 1 holds the review texts (see the printed sample below) and
# column 2 holds the class labels the classifiers are trained on.
dataset = pd.read_csv('generic_sentiment_dataset_50k.csv')
features, labels = (dataset.iloc[:, column].values for column in (1, 2))

In [None]:
# Peek at the raw review texts; NumPy abbreviates the long array with '...'.
print(features)

['good mobile. battery is 5000 mah is very big. camera quality very good.'
 'Overall in hand ecpirience is quite good matt finish really makes you feel premium battrey life supports you hole day still if you are a gamer not full tym 2 hrs of gaming can be easily sustainable for battrey camera is good but misses details many times potrait mode is lagging although processor and 90hz display makes device pretty good to handle oneplus should add some details update for camera or it will hampper their upcoming phone not in oneplus 7t also i have seen in other oneplus device like oneplus 7 pro and you got take care of camera and future updates iphone gives the best camera in current overall best flagship is non other than oneplus 7t with all the feature accept camera details problem'
 '1. Superb Camera,\n2. No lag\n3. This is my first Samsung phone, as earlier I used Xiomi, Asus and Nokia. No comparison with Samsung M31.'
 ... 'Fingerprint does not work properly'
 'Bakbass phone ever. Camera

## Cleaning the texts

In [None]:
import re

def clean_text(text):
    """Normalise one raw review into lower-case, space-separated words.

    Steps, in order:
      1. replace every non-word character (punctuation, symbols) with a space;
      2. drop single letters that stand alone between whitespace;
      3. drop a single letter standing alone at the start of the text;
      4. collapse runs of whitespace into one space;
      5. drop a leading 'b' token (left over from bytes-literal reprs);
      6. lower-case and trim surrounding whitespace.

    Args:
        text: the raw review; any object is accepted and passed through
            ``str()`` first, so missing values (e.g. NaN) do not raise.

    Returns:
        The cleaned string.
    """
    # Remove all the special characters
    cleaned = re.sub(r'\W', ' ', str(text))

    # Remove all single characters surrounded by whitespace
    cleaned = re.sub(r'\s+[a-zA-Z]\s+', ' ', cleaned)

    # Remove a single character from the start. The caret must be an
    # unescaped anchor: the previous pattern r'\^...' looked for a literal
    # '^', which can never be present after the \W substitution above.
    cleaned = re.sub(r'^[a-zA-Z]\s+', ' ', cleaned)

    # Substitute multiple spaces with a single space
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Remove a prefixed 'b'
    cleaned = re.sub(r'^b\s+', '', cleaned)

    # Convert to lowercase; strip the padding the substitutions leave behind
    return cleaned.lower().strip()


# Any code that later feeds text to the saved vectorizer has to apply the
# same cleaning, since the vectorizer is fitted on these cleaned strings.
processed_features = [clean_text(sentence) for sentence in features]

## Creating the TF-IDF Bag of Words model

In [None]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

# English stop words from NLTK are excluded from the vocabulary, which is
# capped at 1500 terms.
english_stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(max_features=1500, stop_words=english_stop_words)

# NOTE(review): the vectorizer is fitted on the whole corpus before the
# train/test split, so test documents influence the vocabulary and IDF
# weights - consider fitting on the training texts only.
tfidf_matrix = vectorizer.fit_transform(processed_features)

# NOTE: this overwrites the list of cleaned strings with a dense TF-IDF
# array, so the cleaning cell must be re-run before running this cell again.
processed_features = tfidf_matrix.toarray()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    processed_features,
    labels,
    test_size=0.2,
    random_state=0,
)

## Training the RandomForestClassifier model on the Training set

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Forest of 80 trees; the fixed seed makes training repeatable.
rf_params = {'n_estimators': 80, 'random_state': 0}
rf_classifier = RandomForestClassifier(**rf_params)
rf_classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict a class label for every row of the held-out test matrix.
rf_predictions = rf_classifier.predict(X_test)

### Making the Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Print, in order: the confusion matrix, the per-class precision/recall/F1
# report, and the overall accuracy of the random forest on the test set.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, rf_predictions))


[[1990  251  524]
 [ 612  483  858]
 [ 376  201 4705]]
              precision    recall  f1-score   support

           0       0.67      0.72      0.69      2765
           1       0.52      0.25      0.33      1953
           2       0.77      0.89      0.83      5282

    accuracy                           0.72     10000
   macro avg       0.65      0.62      0.62     10000
weighted avg       0.69      0.72      0.69     10000

0.7178


## Training the KNN model on the Training set

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours baseline; no arguments are passed, so scikit-learn's
# default hyper-parameters are used.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict a class label for every row of the held-out test matrix.
knn_predictions = knn_classifier.predict(X_test)

### Making the Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Print, in order: the confusion matrix, the per-class precision/recall/F1
# report, and the overall accuracy of the KNN model on the test set.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, knn_predictions))


[[ 928  262 1575]
 [ 371  350 1232]
 [ 329  252 4701]]
              precision    recall  f1-score   support

           0       0.57      0.34      0.42      2765
           1       0.41      0.18      0.25      1953
           2       0.63      0.89      0.74      5282

    accuracy                           0.60     10000
   macro avg       0.53      0.47      0.47     10000
weighted avg       0.57      0.60      0.55     10000

0.5979


## Training the Gaussian Naive Bayes model on the Training set

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes baseline with default settings, trained on the dense
# TF-IDF matrix.
# NOTE(review): MultinomialNB or ComplementNB are the more common choices
# for TF-IDF text features - worth comparing.
gnb_classifier = GaussianNB()
gnb_classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict a class label for every row of the held-out test matrix.
gnb_predictions = gnb_classifier.predict(X_test)

### Making the Confusion Matrix

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
# Print, in order: the confusion matrix, the per-class precision/recall/F1
# report, and the overall accuracy of the Naive Bayes model on the test set.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(y_test, gnb_predictions))


[[1911  463  391]
 [ 707  604  642]
 [ 627  655 4000]]
              precision    recall  f1-score   support

           0       0.59      0.69      0.64      2765
           1       0.35      0.31      0.33      1953
           2       0.79      0.76      0.78      5282

    accuracy                           0.65     10000
   macro avg       0.58      0.59      0.58     10000
weighted avg       0.65      0.65      0.65     10000

0.6515


## Saving the model to deploy in production

The RandomForestClassifier has the best accuracy (0.72, versus 0.65 for Naive Bayes and 0.60 for KNN), so I am choosing this model for the application.

In [None]:
import joblib
# Serialise the trained random forest to disk; the returned list of written
# file names is shown as the cell output.
# NOTE(review): joblib files are pickle-based - only load them from trusted sources.
joblib.dump(rf_classifier, 'finalized_model.sav')

['finalized_model.sav']

In [None]:
# Persist the fitted TF-IDF vectorizer alongside the model: new text must be
# cleaned the same way and transformed with this vectorizer before prediction.
joblib.dump(vectorizer, 'vectorizer.sav')

['vectorizer.sav']