### Importing relevant libraries that will be used throughout this project:

In [None]:
import re
import os
import cv2
import glob
import nltk
import pickle
import natsort
import warnings
import numpy as np
import pandas as pd
from PIL import Image
from os import listdir
from pathlib import Path
from sklearn import tree
from PIL import ImageFile
from nltk.corpus import stopwords
from pytesseract import pytesseract
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import load_files
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, f1_score

In [None]:
# Notebook-wide configuration.
# NOTE: this silences every warning, including deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')
# Allow Pillow to load image files whose data is truncated.
ImageFile.LOAD_TRUNCATED_IMAGES = True
# Location of the Tesseract executable. The TESSERACT_CMD environment variable overrides the
# default so the notebook is not tied to one machine's install path.
# NOTE(review): this value is not assigned to `pytesseract.tesseract_cmd` in the cells shown here -
# confirm it is wired up wherever OCR is actually run.
path_to_tesseract = os.environ.get("TESSERACT_CMD", r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe")

### Pre-processing of the text dataset:

In [None]:
# Load the labelled dataset: image name, raw OCR text, corrected text and sentiment label.
labels_csv = Path(r"C:\Users\user\Desktop\Project\labels.csv")
df = pd.read_csv(labels_csv)
df

Unnamed: 0.1,Unnamed: 0,image_name,text_ocr,text_corrected,overall_sentiment
0,0,image_1.jpg,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,LOOK THERE MY FRIEND LIGHTYEAR NOW ALL SOHALIK...,very_positive
1,1,image_2.jpeg,The best of #10 YearChallenge! Completed in le...,The best of #10 YearChallenge! Completed in le...,very_positive
2,2,image_3.JPG,Sam Thorne @Strippin ( Follow Follow Saw every...,Sam Thorne @Strippin ( Follow Follow Saw every...,positive
3,3,image_4.png,10 Year Challenge - Sweet Dee Edition,10 Year Challenge - Sweet Dee Edition,positive
4,4,image_5.png,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,10 YEAR CHALLENGE WITH NO FILTER 47 Hilarious ...,neutral
...,...,...,...,...,...
6987,6987,image_6988.jpg,Tuesday is Mardi Gras Wednesday is Valentine's...,Tuesday is Mardi Gras Wednesday is Valentine's...,neutral
6988,6988,image_6989.jpg,MUST WATCH MOVIES OF 2017 ITI Chennai memes MA...,MUST WATCH MOVIES OF 2017 ITI Chennai memes MA...,neutral
6989,6989,image_6990.png,LESS MORE TALKING PLANNING SODA JUNK FOOD COMP...,LESS MORE TALKING PLANNING SODA JUNK FOOD COMP...,positive
6990,6990,image_6991.jpg,When I VERY have time is a fantasy No one has ...,When I have time is a fantasy. no one has time...,very_positive


In [None]:
# Model input (corrected caption text) and target (5-level sentiment label).
X = df.text_corrected
senti = df['overall_sentiment']

# Remove the columns the models do not use. Re-assigning `df` with errors = 'ignore'
# (instead of two `inplace = True` drops) keeps the cell idempotent: running it a second
# time no longer raises KeyError for columns that are already gone.
df = df.drop(columns = ['Unnamed: 0', 'image_name', 'text_ocr'], errors = 'ignore')

In [None]:
# Bag-of-words features: at most 1500 terms, each present in at least 5 documents
# and in no more than 80% of them.
cv = CountVectorizer(max_features = 1500, min_df = 5, max_df = 0.8)

# Missing captions become empty strings; converting NaN with np.str_ produced the literal
# token "nan", which could enter the vocabulary. Reading from `df` (rather than from `X`,
# which this cell overwrites with the sparse matrix) also makes the cell safe to re-run.
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test split,
# so vocabulary selection sees the test rows - consider fitting on the training rows only.
X = cv.fit_transform(df['text_corrected'].fillna('').astype(str))

In [None]:
# Collapse the five sentiment levels into three classes: 1, 0 and -1.
y = {
    'very_positive': 1,
    'positive': 1,
    'neutral': 0,
    'negative': -1,
    'very_negative': -1,
}

# Hold out 25% of the rows for testing (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    senti.map(y),
    test_size = 0.25,
    random_state = 50,
)

# Counts of the original 5-level labels: the dataset is imbalanced.
df['overall_sentiment'].value_counts()

positive         3127
neutral          2201
very_positive    1033
negative          480
very_negative     151
Name: overall_sentiment, dtype: int64

In [None]:
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by randomly over-sampling it.
over_sampler = RandomOverSampler(random_state = 45)
X_res, y_res = over_sampler.fit_resample(X_train, y_train)

# Balance the test split by randomly under-sampling it.
# NOTE(review): the text models are therefore evaluated on a class distribution that
# differs from the raw data - confirm this is intended.
under_sampler = RandomUnderSampler(random_state = 45)
X_tes, y_tes = under_sampler.fit_resample(X_test, y_test)

# Per-class row counts after resampling.
print(f"Training target statistics: {Counter(y_res)}")
print(f"Testing target statistics: {Counter(y_tes)}")

Training target statistics: Counter({-1: 3084, 1: 3084, 0: 3084})
Testing target statistics: Counter({-1: 160, 0: 160, 1: 160})


### Training K-Nearest Neighbours Classifier (for text) & evaluating model:

In [None]:
# k-NN on the bag-of-words counts: 30 neighbours, Manhattan distance, uniform votes.
neigh_text = KNeighborsClassifier(
    n_neighbors = 30,
    metric = 'manhattan',
    weights = 'uniform',
)
neigh_text.fit(X_res, y_res)  # fit() returns the estimator itself, so no re-assignment is needed
y_pred = neigh_text.predict(X_tes)

In [None]:
# Evaluate the text k-NN predictions on the balanced test split.
cm = confusion_matrix(y_tes, y_pred)
report = classification_report(y_tes, y_pred)
macro_f1 = f1_score(y_tes, y_pred, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[ 46  96  18]
 [ 28 104  28]
 [ 37  92  31]]


               precision    recall  f1-score   support

          -1       0.41      0.29      0.34       160
           0       0.36      0.65      0.46       160
           1       0.40      0.19      0.26       160

    accuracy                           0.38       480
   macro avg       0.39      0.38      0.35       480
weighted avg       0.39      0.38      0.35       480

F1 score is: 0.35375458717060565


In [None]:
pkl_filename1 = "KNN_Text_Classifier.pkl"

# Persist the fitted text k-NN model. The handle is called `model_file` rather than `file`
# so it does not rebind the notebook-level name `file`, which the image pre-processing
# cells use for the list of image file names.
with open(pkl_filename1, 'wb') as model_file:
    pickle.dump(neigh_text, model_file)

### Training Decision Tree Classifier (for text) & evaluating model:

In [None]:
# Entropy-based decision tree with random split selection, balanced class weights and
# limits on depth, split size and leaf size.
clf_text = tree.DecisionTreeClassifier(
    criterion = 'entropy',
    splitter = 'random',
    max_depth = 100,
    min_samples_split = 100,
    min_samples_leaf = 50,
    class_weight = 'balanced',
    random_state = 10,
)
clf_text.fit(X_res, y_res)  # fit() returns the estimator itself
y_pred = clf_text.predict(X_tes)

In [None]:
# Evaluate the text decision-tree predictions on the balanced test split.
cm = confusion_matrix(y_tes, y_pred)
report = classification_report(y_tes, y_pred)
macro_f1 = f1_score(y_tes, y_pred, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[72 68 20]
 [60 78 22]
 [62 72 26]]


               precision    recall  f1-score   support

          -1       0.37      0.45      0.41       160
           0       0.36      0.49      0.41       160
           1       0.38      0.16      0.23       160

    accuracy                           0.37       480
   macro avg       0.37      0.37      0.35       480
weighted avg       0.37      0.37      0.35       480

F1 score is: 0.3491827497179862


In [None]:
pkl_filename2 = "Decision_Tree_Classifier.pkl"

# Persist the fitted text decision tree. The handle is called `model_file` rather than `file`
# so it does not rebind the notebook-level name `file`, which the image pre-processing
# cells use for the list of image file names.
with open(pkl_filename2, 'wb') as model_file:
    pickle.dump(clf_text, model_file)

### Training Logistic Regression model (for text) & evaluating model:

In [None]:
# L2-regularised logistic regression (C = 0.01, liblinear solver) on the resampled training set.
LR = LogisticRegression(
    penalty = 'l2',
    C = 0.01,
    solver = 'liblinear',
    random_state = 10,
)
LR.fit(X_res, y_res)  # fit() returns the estimator itself
y_pred = LR.predict(X_tes)

In [None]:
# Evaluate the text logistic-regression predictions on the balanced test split.
cm = confusion_matrix(y_tes, y_pred)
report = classification_report(y_tes, y_pred)
macro_f1 = f1_score(y_tes, y_pred, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[51 53 56]
 [51 61 48]
 [40 51 69]]


               precision    recall  f1-score   support

          -1       0.36      0.32      0.34       160
           0       0.37      0.38      0.38       160
           1       0.40      0.43      0.41       160

    accuracy                           0.38       480
   macro avg       0.38      0.38      0.38       480
weighted avg       0.38      0.38      0.38       480

F1 score is: 0.3758491247232969


In [None]:
pkl_filename3 = "Logistic_Regression_Classifier.pkl"

# Persist the fitted text logistic regression. The handle is called `model_file` rather than
# `file` so it does not rebind the notebook-level name `file`, which the image pre-processing
# cells use for the list of image file names.
with open(pkl_filename3, 'wb') as model_file:
    pickle.dump(LR, model_file)

### Pre-processing of images dataset:

In [None]:
from skimage import color
from skimage.io import imread
from skimage.feature import hog
from skimage.transform import resize

In [None]:
# Every entry of the image folder, sorted in natural order (so that e.g. "image_2" comes
# before "image_10"). The feature-extraction loop pairs entry i with label i.
# NOTE(review): every directory entry is included, not only image files - verify the folder
# contains nothing else.
folder_dir = "C:/Users/user/Desktop/Project/images"
file = natsort.natsorted(os.listdir(folder_dir))

In [None]:
# Empty frame that will hold one row per image:
#   'Fd'                - the image's HOG feature descriptor
#   'overall_sentiment' - the sentiment label paired with that image
col = ['Fd', 'overall_sentiment']
df2 = pd.DataFrame(columns = col)

In [None]:
# Compute one HOG descriptor per image and pair it with the sentiment label at the same
# position. Rows are collected in a list and the frame is built once at the end: this avoids
# the quadratic row-by-row `DataFrame.append` (removed in pandas 2.0) and makes the cell
# idempotent - re-running it rebuilds `df2` instead of appending duplicate rows.
# NOTE(review): labels are matched to images purely by position (`senti[i]`); this assumes the
# naturally sorted folder listing has exactly the same order and length as the CSV - confirm.
records = []

for i, image_name in enumerate(file):

    imag_pth = r"C:\Users\user\Desktop\Project\images\{}".format(image_name)
    img = imread(imag_pth)

    if img.ndim == 2:
        # Greyscale image: expand to three channels so every image is (rows, cols, channels).
        img = color.gray2rgb(img)
    elif img.ndim != 3:
        # Explicit failure replaces the former bare `except:`, which hid the real cause.
        raise ValueError("Unexpected image shape {} for {}".format(img.shape, image_name))

    # Bring every image to a common 512 x 256 size so all descriptors have equal length.
    resized_img = resize(img, (128*4, 64*4))

    # Only the descriptor is needed, so the HOG visualisation image is no longer computed.
    # NOTE(review): newer scikit-image releases replace `multichannel=True` with
    # `channel_axis=-1`; switch if the installed version rejects `multichannel`.
    fd = hog(resized_img, orientations=9, pixels_per_cell=(8, 8),
             cells_per_block=(2, 2), multichannel=True)

    records.append({'Fd': fd, 'overall_sentiment': senti[i]})

df2 = pd.DataFrame(records, columns = ['Fd', 'overall_sentiment'])

In [None]:
# Write the descriptor/label table to disk, then display it.
# NOTE(review): each 'Fd' cell holds an array; CSV stores it as text, and long arrays are
# presumably written in abbreviated form, so the features may not be recoverable from this
# file - verify before relying on it as a cache.
df2.to_csv("Image_Fd.csv")
df2

Unnamed: 0,Fd,overall_sentiment
0,"[0.16246362069040354, 0.08444857285499856, 0.0...",very_positive
1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",very_positive
2,"[0.4695715440209436, 0.007319040344629182, 0.0...",positive
3,"[0.2769939213359277, 0.08707085859377085, 0.03...",positive
4,"[0.022272748396145056, 0.0, 0.0059946661863631...",neutral
...,...,...
6987,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",neutral
6988,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",neutral
6989,"[0.042786180791997454, 0.0, 0.0307012700991192...",positive
6990,"[0.46689913471269245, 0.09902658307625116, 0.0...",very_positive


In [None]:
# Image features and their 5-level sentiment labels.
X_pic = df2['Fd']
senti_pic = df2['overall_sentiment']

# Collapse the five sentiment levels into three classes: 1, 0 and -1.
y_pic = {
    'very_positive': 1,
    'positive': 1,
    'neutral': 0,
    'negative': -1,
    'very_negative': -1,
}

# Hold out 20% of the images for testing (fixed seed for repeatability).
X_train_pic, X_test_pic, y_train_pic, y_test_pic = train_test_split(
    X_pic,
    senti_pic.map(y_pic),
    test_size = 0.2,
    random_state = 50,
)

### Training Decision Tree Classifier (**for images**) & evaluating model:

In [None]:
# Entropy-based decision tree with random split selection and limits on depth, split size
# and leaf size. The Series of descriptors is passed as a list of per-image arrays.
clf_img = tree.DecisionTreeClassifier(
    criterion = 'entropy',
    splitter = 'random',
    max_depth = 100,
    min_samples_split = 100,
    min_samples_leaf = 50,
    random_state = 10,
)
clf_img.fit(list(X_train_pic), y_train_pic)  # fit() returns the estimator itself
y_pred_pic = clf_img.predict(list(X_test_pic))

In [None]:
# Evaluate the image decision-tree predictions on the held-out images.
cm = confusion_matrix(y_test_pic, y_pred_pic)
report = classification_report(y_test_pic, y_pred_pic)
macro_f1 = f1_score(y_test_pic, y_pred_pic, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
# Label fixed: it previously read "F1 score is of:".
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[  2  36  93]
 [  8 107 296]
 [ 13 221 623]]


               precision    recall  f1-score   support

          -1       0.09      0.02      0.03       131
           0       0.29      0.26      0.28       411
           1       0.62      0.73      0.67       857

    accuracy                           0.52      1399
   macro avg       0.33      0.33      0.32      1399
weighted avg       0.47      0.52      0.49      1399

F1 score is of: 0.322923241632919


In [None]:
pkl_filename4 = "Decision_Tree_img_Classifier.pkl"

# Persist the fitted image decision tree. The handle is called `model_file` rather than `file`:
# the old name overwrote the notebook-level `file` list of image names, so re-running the
# HOG feature-extraction cell afterwards would fail.
with open(pkl_filename4, 'wb') as model_file:
    pickle.dump(clf_img, model_file)

### Training K-Nearest Neighbours Classifier (**for images**) & evaluating model:

In [None]:
# k-NN on the HOG descriptors: 5 neighbours, Manhattan distance.
neigh = KNeighborsClassifier(
    n_neighbors = 5,
    metric = 'manhattan',
)
neigh.fit(list(X_train_pic), y_train_pic)  # fit() returns the estimator itself
y_pred_pic = neigh.predict(list(X_test_pic))

In [None]:
# Evaluate the image k-NN predictions on the held-out images.
cm = confusion_matrix(y_test_pic, y_pred_pic)
report = classification_report(y_test_pic, y_pred_pic)
macro_f1 = f1_score(y_test_pic, y_pred_pic, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
# Label fixed: the trailing space in "F1 score is: " printed a double space before the value.
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[  8  41  82]
 [ 24 120 267]
 [ 39 277 541]]


               precision    recall  f1-score   support

          -1       0.11      0.06      0.08       131
           0       0.27      0.29      0.28       411
           1       0.61      0.63      0.62       857

    accuracy                           0.48      1399
   macro avg       0.33      0.33      0.33      1399
weighted avg       0.46      0.48      0.47      1399

F1 score is:  0.32708029531191934


In [None]:
pkl_filename5 = "KNN_img_Classifier.pkl"

# Persist the fitted image k-NN model. The handle is called `model_file` rather than `file`:
# the old name overwrote the notebook-level `file` list of image names, so re-running the
# HOG feature-extraction cell afterwards would fail.
with open(pkl_filename5, 'wb') as model_file:
    pickle.dump(neigh, model_file)

### Training Logistic Regression model (**for images**) & evaluating model:

In [None]:
# L2-regularised logistic regression (C = 0.01, liblinear solver) on the HOG descriptors.
LR_img = LogisticRegression(
    penalty = 'l2',
    C = 0.01,
    solver = 'liblinear',
    random_state = 10,
)
LR_img.fit(list(X_train_pic), y_train_pic)  # fit() returns the estimator itself
y_pred_pic = LR_img.predict(list(X_test_pic))

In [None]:
# Evaluate the image logistic-regression predictions on the held-out images.
cm = confusion_matrix(y_test_pic, y_pred_pic)
report = classification_report(y_test_pic, y_pred_pic)
macro_f1 = f1_score(y_test_pic, y_pred_pic, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
# Label fixed: it previously read "F1 score of is:".
print("F1 score is:", macro_f1)

Confusion Matrix:

 [[  1  24 106]
 [  0  95 316]
 [  2 171 684]]


               precision    recall  f1-score   support

          -1       0.33      0.01      0.01       131
           0       0.33      0.23      0.27       411
           1       0.62      0.80      0.70       857

    accuracy                           0.56      1399
   macro avg       0.43      0.35      0.33      1399
weighted avg       0.51      0.56      0.51      1399

F1 score of is: 0.32761975135618626


In [None]:
pkl_filename6 = "Logistic_Regression_img_Classifier.pkl"

# Persist the fitted image logistic regression. The handle is called `model_file` rather than
# `file`: the old name overwrote the notebook-level `file` list of image names, so re-running
# the HOG feature-extraction cell afterwards would fail.
with open(pkl_filename6, 'wb') as model_file:
    pickle.dump(LR_img, model_file)

### Loading the saved text models:

In [None]:
# Reload the text k-NN model written to `pkl_filename1` to check that the pickle round-trips.
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The handle is called `model_file` so it does not overwrite the notebook-level `file` list.
with open(pkl_filename1, 'rb') as model_file:
    Pkl_KNN_text = pickle.load(model_file)

# Backward-compatible alias. The original name suggested an image model and differed from
# the image k-NN variable (`Pkl_knn_img`) only by letter case.
Pkl_KNN_img = Pkl_KNN_text

Pkl_KNN_text

In [None]:
# Reload the text decision tree written to `pkl_filename2` to check that the pickle round-trips.
# The handle is called `model_file` so it does not overwrite the notebook-level `file` list
# of image names.
with open(pkl_filename2, 'rb') as model_file:
    Pkl_decision_tree = pickle.load(model_file)

Pkl_decision_tree

In [None]:
# Reload the text logistic regression written to `pkl_filename3` to check that the pickle
# round-trips. The handle is called `model_file` so it does not overwrite the notebook-level
# `file` list of image names.
with open(pkl_filename3, 'rb') as model_file:
    Pkl_logistic_regression = pickle.load(model_file)

Pkl_logistic_regression

### Loading the saved image models:

In [None]:
# Reload the image decision tree written to `pkl_filename4` to check that the pickle
# round-trips. The handle is called `model_file` so it does not overwrite the notebook-level
# `file` list of image names.
with open(pkl_filename4, 'rb') as model_file:
    Pkl_decision_tree_img = pickle.load(model_file)

Pkl_decision_tree_img

In [None]:
# Reload the image k-NN model written to `pkl_filename5` to check that the pickle round-trips.
# The handle is called `model_file` so it does not overwrite the notebook-level `file` list
# of image names.
with open(pkl_filename5, 'rb') as model_file:
    Pkl_knn_img = pickle.load(model_file)

Pkl_knn_img

In [None]:
# Reload the image logistic regression written to `pkl_filename6` to check that the pickle
# round-trips. The handle is called `model_file` so it does not overwrite the notebook-level
# `file` list of image names.
with open(pkl_filename6, 'rb') as model_file:
    Pkl_logistic_regression_img = pickle.load(model_file)

Pkl_logistic_regression_img

### Applying voting classifier (**for text**) and ensembling trained models:

In [None]:
from sklearn.ensemble import VotingClassifier

In [None]:
# Hard-voting ensemble for text. The three estimators below are new, unfitted instances;
# VotingClassifier fits them itself on the resampled training set.
for_text = [
    ('LR', LogisticRegression(C = 0.01, penalty = 'l2', solver = 'liblinear', random_state = 10)),
    ('KNC', KNeighborsClassifier(metric = 'manhattan', n_neighbors = 30, weights = 'uniform')),
    ('DTC', tree.DecisionTreeClassifier(random_state = 10, class_weight = 'balanced')),
]

vot_hard = VotingClassifier(estimators = for_text, voting = 'hard')
vot_hard.fit(X_res, y_res)
y_pred = vot_hard.predict(X_tes)

In [None]:
# Evaluate the text voting-ensemble predictions on the balanced test split.
cm = confusion_matrix(y_tes, y_pred)
report = classification_report(y_tes, y_pred)
macro_f1 = f1_score(y_tes, y_pred, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
print("F1 score of Voting Classifier is:", macro_f1)

Confusion Matrix:

 [[63 57 40]
 [46 70 44]
 [47 56 57]]


               precision    recall  f1-score   support

          -1       0.40      0.39      0.40       160
           0       0.38      0.44      0.41       160
           1       0.40      0.36      0.38       160

    accuracy                           0.40       480
   macro avg       0.40      0.40      0.40       480
weighted avg       0.40      0.40      0.40       480

F1 score of Voting Classifier is: 0.3952116613498505


In [None]:
pkl_filename7 = "Voting_Classifier.pkl"

# Persist the fitted text voting ensemble. The handle is called `model_file` rather than `file`:
# the old name overwrote the notebook-level `file` list of image names, so re-running the
# HOG feature-extraction cell afterwards would fail.
with open(pkl_filename7, 'wb') as model_file:
    pickle.dump(vot_hard, model_file)

### Applying voting classifier (**for images**) and ensembling trained models:

In [None]:
# Voting ensemble for images. The three estimators below are new, unfitted instances;
# VotingClassifier fits them itself on the image training split.
for_image = [
    ('DTC', tree.DecisionTreeClassifier(random_state = 10)),
    ('KNC', KNeighborsClassifier(n_neighbors = 5)),
    ('LR', LogisticRegression(multi_class = 'multinomial', random_state = 10)),
]

# NOTE(review): the variable is named `vot_hard_img` but the ensemble uses voting = 'soft',
# whereas the text ensemble uses 'hard' - confirm which one is intended.
vot_hard_img = VotingClassifier(estimators = for_image, voting = 'soft')
vot_hard_img.fit(list(X_train_pic), y_train_pic)
y_pred = vot_hard_img.predict(list(X_test_pic))

In [None]:
# Evaluate the image voting-ensemble predictions on the held-out images.
# NOTE(review): the saved output of this cell reports 1049 test rows while the other image
# evaluations report 1399 - presumably it was produced with a different split; re-run to refresh.
cm = confusion_matrix(y_test_pic, y_pred)
report = classification_report(y_test_pic, y_pred)
macro_f1 = f1_score(y_test_pic, y_pred, average = "macro")

print("Confusion Matrix:\n\n", cm)
print("\n\n", report)
print("F1 score of Voting Classifier is:", macro_f1)

Confusion Matrix:

 [[  1  25  64]
 [ 10  82 227]
 [ 17 176 447]]


               precision    recall  f1-score   support

          -1       0.04      0.01      0.02        90
           0       0.29      0.26      0.27       319
           1       0.61      0.70      0.65       640

    accuracy                           0.51      1049
   macro avg       0.31      0.32      0.31      1049
weighted avg       0.46      0.51      0.48      1049

F1 score of Voting Classifier is: 0.3127135765744731


In [None]:
pkl_filename8 = "Voting_img_Classifier.pkl"

# Persist the fitted image voting ensemble. The handle is called `model_file` rather than
# `file`: the old name overwrote the notebook-level `file` list of image names, so re-running
# the HOG feature-extraction cell afterwards would fail.
with open(pkl_filename8, 'wb') as model_file:
    pickle.dump(vot_hard_img, model_file)

### Printing overall F1 score (**for all** trained models):

In [None]:
# Mean macro-F1 over the six individually trained models (three text, three image).
# The scores are recomputed from the fitted models instead of being typed in by hand: the
# hand-typed list had drifted from the printed results (33.19 was used for the image
# decision tree, whose reported macro-F1 is 0.3229).
X_test_pic_rows = list(X_test_pic)  # list of per-image descriptors, built once for the three image models

overall_scores = [
    f1_score(y_tes, neigh_text.predict(X_tes), average = "macro"),
    f1_score(y_tes, clf_text.predict(X_tes), average = "macro"),
    f1_score(y_tes, LR.predict(X_tes), average = "macro"),
    f1_score(y_test_pic, clf_img.predict(X_test_pic_rows), average = "macro"),
    f1_score(y_test_pic, neigh.predict(X_test_pic_rows), average = "macro"),
    f1_score(y_test_pic, LR_img.predict(X_test_pic_rows), average = "macro"),
]

print("Overall F1 score is:", sum(overall_scores) / len(overall_scores))

Overall F1 score is: 0.34421666666666667
