<a href="https://colab.research.google.com/github/lauraAriasFdez/SentimentAnalysis/blob/main/project_tfif.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1. Connect To Google Drive + Get Data


In [None]:
# Mount Google Drive at /content/gdrive so files stored there are readable from this runtime.
# TODO: the main project directory is still to be decided.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Path of the sentiment CSV on the mounted Drive; change this if the file lives elsewhere.
data_file = "/content/gdrive/MyDrive/CSCI4511W/project/sentiments.csv"

In [None]:
import pandas as pd
import numpy as np

# The CSV is read without a header row, so the column names are supplied explicitly.
cols = ['sentiment','id','date','query_string','user','text']
sms_data = pd.read_csv(data_file, names=cols, header=None, encoding='latin-1')

# Recode the labels to binary: 0 stays 0 (negative), 4 becomes 1 (positive).
sms_data['sentiment'] = sms_data['sentiment'].replace({0: 0, 4: 1})

# Target vector for the classifiers, taken from the first column.
labels = sms_data['sentiment'].to_numpy()


### Preprocess Data


In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#English stop-words from the NLTK package are removed from each sentence.
#Every remaining (non-stop) word is then stemmed, i.e. reduced to its root form.

"""
https://stackoverflow.com/questions/52026677/sentiment140-preprocessing
https://www.analyticsvidhya.com/blog/2020/11/understanding-naive-bayes-svm-and-its-implementation-on-spam-sms/


"""
def clean_data(content):
  """Clean raw tweets in place so they are ready for feature extraction.

  For every entry: @mentions and http(s) URLs are removed, each non-letter
  character is replaced by a space, the text is lower-cased, English
  stop-words are dropped, and the remaining words are Porter-stemmed and
  re-joined with single spaces.

  Args:
    content: mutable collection of tweet strings that supports ``len()`` and
      ``content[i]`` for ``i`` in ``range(len(content))``.

  Returns:
    The same ``content`` object, each entry overwritten with its cleaned text.
  """
  stemming = PorterStemmer()
  # Build the stop-word set once, instead of once per token inside the loop.
  stop_words = set(stopwords.words('english'))

  for i in range(len(content)):

    ## print where in cleaning they are
    if i % 1000000 == 0:
      print(i ," already cleaned")

    # remove @mentions; handles may contain underscores, so "_" is part of the
    # pattern (otherwise "@some_user" leaves "user" behind as a word)
    tweet = re.sub(r'@[A-Za-z0-9_]+', "", content[i])
    # remove urls
    tweet = re.sub(r'https?:\/\/\S+', "", tweet)

    # replace every non-letter character (punctuation, digits, ...) with a space
    tweet = re.sub('[^a-zA-Z]', repl=' ', string=tweet)
    # str.lower() returns a new string, so the result has to be re-bound;
    # lower-casing before the stop-word test lets "The" match "the"
    tweet = tweet.lower().split()

    # stem every word that is not a stop-word
    tweet = [stemming.stem(word) for word in tweet if word not in stop_words]

    # store the cleaned tweet back in place
    content[i] = ' '.join(tweet)
  return content


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.



https://getpocket.com/read/3040941140

 Texthero is designed as a Pandas wrapper, which makes it easy to preprocess and analyze text-based Pandas Series.

In [None]:
!pip install texthero
import pandas as pd
import texthero as hero #config import cid, csec, ua

Collecting texthero
  Downloading texthero-1.1.0-py3-none-any.whl (24 kB)
Collecting unidecode>=1.1.1
  Downloading Unidecode-1.3.4-py3-none-any.whl (235 kB)
[K     |████████████████████████████████| 235 kB 7.0 MB/s 
Collecting nltk>=3.3
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 42.7 MB/s 
Collecting regex>=2021.8.3
  Downloading regex-2022.3.15-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (749 kB)
[K     |████████████████████████████████| 749 kB 51.4 MB/s 
Installing collected packages: regex, unidecode, nltk, texthero
  Attempting uninstall: regex
    Found existing installation: regex 2019.12.20
    Uninstalling regex-2019.12.20:
      Successfully uninstalled regex-2019.12.20
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.5
    Uninstalling nltk-3.2.5:
      Successfully uninstalled nltk-3.2.5
Successfully installed nltk-3.7 regex-2022.3.15 texthero-1.1.0 unidecode-1.3.4


In [None]:

# texthero preprocessing steps, listed in the order they are handed to hero.clean.
custom_cleaning = [
  hero.preprocessing.fillna,              # replace not-assigned values with empty text
  hero.preprocessing.lowercase,
  hero.preprocessing.remove_digits,
  hero.preprocessing.remove_punctuation,
  hero.preprocessing.remove_diacritics,
  hero.preprocessing.remove_stopwords,
  hero.preprocessing.remove_whitespace,
  hero.preprocessing.stem,
]

# Run the tweet text column through the pipeline above.
content = sms_data['text'].pipe(hero.clean, pipeline=custom_cleaning)


In [None]:
#content = content.to_numpy()

### TF-IDF Feature Extraction 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the cleaned text and encode every
# document as a TF-IDF row in one pass.
# NOTE(review): the vectorizer is fit on the whole corpus here, so any held-out
# rows later taken from tfidf_data have already influenced the vocabulary and
# IDF weights — confirm whether that is acceptable for the evaluation.
tfidf = TfidfVectorizer()
tfidf_data = tfidf.fit_transform(content)

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned text itself rather than pre-computed features, so that the
# vectorizer can be fit on the training rows only. Fitting TF-IDF on the full
# corpus lets test-set vocabulary and document frequencies leak into training.
# Stratified 70/30 split with a fixed seed for reproducibility.
content_train, content_test, y_train, y_test = train_test_split(
    content, labels, test_size=0.3, stratify=labels, random_state=100
)

# Fit on the training text only ...
tfidf = TfidfVectorizer()
tfidf_x_train = tfidf.fit_transform(content_train)
# ... and only transform the test text: words unseen in training are ignored
# instead of being added to the vocabulary.
tfidf_x_test = tfidf.transform(content_test)

### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import f1_score

# Multinomial Naive Bayes on the TF-IDF features.
# Banner line so this model's report can be told apart in the cell output.
print("NAIVE BAYES + TLF______________________________________________________________")

# fit() returns the estimator itself, so construction and training can be chained.
clf_multinomialnb = MultinomialNB().fit(tfidf_x_train, y_train)
y_pred = clf_multinomialnb.predict(tfidf_x_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Displayed as the cell result: F1 with scikit-learn's default (binary) averaging.
# Pass average='weighted' to f1_score for a class-weighted figure instead.
f1_score(y_true=y_test, y_pred=y_pred)

NAIVE BAYES + TLF______________________________________________________________
              precision    recall  f1-score   support

           0       0.74      0.79      0.76    240000
           1       0.77      0.72      0.75    240000

    accuracy                           0.76    480000
   macro avg       0.76      0.76      0.76    480000
weighted avg       0.76      0.76      0.76    480000



0.7475814603720831

### SVM

In [None]:
from sklearn.svm import LinearSVC

# Linear support-vector classifier on the TF-IDF features.
# Banner line so this model's report can be told apart in the cell output.
print("LINEAR SVM + TLF______________________________________________________________")

# fit() returns the estimator itself, so construction and training can be chained.
linearsvc = LinearSVC().fit(tfidf_x_train, y_train)
y_pred = linearsvc.predict(tfidf_x_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_true=y_test, y_pred=y_pred))

# Displayed as the cell result: F1 with scikit-learn's default (binary) averaging.
f1_score(y_true=y_test, y_pred=y_pred)

LINEAR SVM + TLF______________________________________________________________
              precision    recall  f1-score   support

           0       0.78      0.75      0.77    240000
           1       0.76      0.79      0.77    240000

    accuracy                           0.77    480000
   macro avg       0.77      0.77      0.77    480000
weighted avg       0.77      0.77      0.77    480000



0.774352787433251

### Logistic Regression


In [None]:
#https://towardsdatascience.com/logistic-regression-using-python-sklearn-numpy-mnist-handwriting-recognition-matplotlib-a6b31e2b166a

from sklearn.linear_model import LogisticRegression
# With the default max_iter the solver stopped before converging on this data
# ("TOTAL NO. of ITERATIONS REACHED LIMIT" warning), so the reported scores came
# from an unconverged model. Allow more iterations so the optimisation can finish.
logisticRegr = LogisticRegression(max_iter=1000)
logisticRegr.fit(tfidf_x_train,y_train)

y_pred = logisticRegr.predict(tfidf_x_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test,y_pred))
# Displayed as the cell result: F1 with scikit-learn's default (binary) averaging.
f1_score(y_test,y_pred)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


              precision    recall  f1-score   support

           0       0.79      0.75      0.77    240000
           1       0.76      0.80      0.78    240000

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000



0.7841143797154724