 **Importing the Dependencies**

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [16]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [17]:
# printing stopwords in english
print(stopwords.words('english'))

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

**Data Preprocessing**

In [18]:
#loading the data from csv file to pandas dataframe
data = pd.read_csv('/content/Twitter.csv', encoding="latin-1")


In [19]:
# checking the number of rows and columns
data.shape

(1599999, 6)

In [20]:
# printing the first 5 rows of the dataframe
data.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [24]:
# Naming the column and reading dataset again
column_names=['target','id','date','flag','user','text']
data=pd.read_csv("/content/Twitter.csv",names=column_names,encoding="latin-1")

In [25]:
# Checking the number of rows and column
data.shape

(1600000, 6)

In [26]:
# printing the first 5 rows of data
data.head()

Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [27]:
# Counting the number of missing values in the dataset
data.isnull().sum()

Unnamed: 0,0
target,0
id,0
date,0
flag,0
user,0
text,0


In [28]:
# Checking the distribution of target variable
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
4,800000


**Convert the target "4" into "1"**

In [29]:
data.replace({'target':{4:1}},inplace=True)

In [30]:
# Checking the distribution of target variable
data['target'].value_counts()

Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
0,800000
1,800000


 **0->Negative Tweet,1->Positive Tweet**

**Stemming**

In [31]:
ps=PorterStemmer()


In [32]:
def stemming(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep spaces
    text = text.lower()
    text = text.split()
    text = [word for word in text if word not in stopwords.words('english')]
    text = [ps.stem(word) for word in text]
    return ' '.join(text)  # Maintain readability

In [33]:
data['Stemmed_data']=data['text'].apply(stemming)

In [34]:
data.head()

Unnamed: 0,target,id,date,flag,user,text,Stemmed_data
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot httptwitpiccomyzl awww that bummer ...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset cant updat facebook text might cri resul...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav im mad cant see


In [35]:
print(data['Stemmed_data'])

0          switchfoot httptwitpiccomyzl awww that bummer ...
1          upset cant updat facebook text might cri resul...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                      nationwideclass behav im mad cant see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdbcom cool hear old walt interview httpbli...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: Stemmed_data, Length: 1600000, dtype: object


In [36]:
print(data['target'])


0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


**Converting Text data into numercial**

In [37]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(data['Stemmed_data'])

In [38]:
Y = data['target'].values

**Splitting the data into train and test**

In [39]:
x_train, x_test, y_train, y_test = train_test_split(X_tfidf, Y,test_size=0.3)


**Model creation**

In [40]:
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=20, max_depth=10),
    "Multinomial Naive Bayes": MultinomialNB()
}

for name, model in models.items():
    model.fit(x_train, y_train)
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    print(f"\n{name}:")
    print(f"Accuracy on Training Data: {accuracy_score(y_train, y_train_pred) * 100:.2f}%")
    print(f"Accuracy on Testing Data: {accuracy_score(y_test, y_test_pred) * 100:.2f}%")
    print(f"Classification Report:\n{classification_report(y_test, y_test_pred)}")
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_test_pred)}")




Logistic Regression:
Accuracy on Training Data: 80.59%
Accuracy on Testing Data: 78.03%
Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.75      0.77    240254
           1       0.77      0.81      0.79    239746

    accuracy                           0.78    480000
   macro avg       0.78      0.78      0.78    480000
weighted avg       0.78      0.78      0.78    480000

Confusion Matrix:
[[180886  59368]
 [ 46102 193644]]

Random Forest:
Accuracy on Training Data: 60.15%
Accuracy on Testing Data: 59.85%
Classification Report:
              precision    recall  f1-score   support

           0       0.65      0.42      0.51    240254
           1       0.57      0.77      0.66    239746

    accuracy                           0.60    480000
   macro avg       0.61      0.60      0.59    480000
weighted avg       0.61      0.60      0.59    480000

Confusion Matrix:
[[101521 138733]
 [ 53991 185755]]

Multinomial Naive Bay

**Prediction System**

In [47]:
def predict_sentiment(text, model, vectorizer, stop_words, ps):
    processed_text = stemming(text)

    text_tfidf = vectorizer.transform([processed_text])

    prediction = model.predict(text_tfidf)

    return "Positive" if prediction[0] == 1 else "Negative"


new_input = "Farrah Fawcett passed away...so sad "
predicted_sentiment = predict_sentiment(new_input, model, vectorizer, stopwords, ps)
print(f"\nPredicted Sentiment for '{new_input}': {predicted_sentiment}")


Predicted Sentiment for 'Farrah Fawcett passed away...so sad ': Negative
