In [2]:
pip install numpy pandas scikit-learn nltk




In [6]:
import pandas as pd

file_path = "/content/train.csv"

df = pd.read_csv(file_path, encoding="latin-1")
print(df.head())



       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138                          my boss is bullying me...   
3  9642c003ef                     what interview! leave me alone   
4  358bd9e861   Sons of ****, why couldn`t they put them on t...   

                         selected_text sentiment Time of Tweet Age of User  \
0  I`d have responded, if I were going   neutral       morning        0-20   
1                             Sooo SAD  negative          noon       21-30   
2                          bullying me  negative         night       31-45   
3                       leave me alone  negative       morning       46-60   
4                        Sons of ****,  negative          noon       60-70   

       Country  Population -2020  Land Area (Km²)  Density (P/Km²)  
0  Afghanistan          38928346         652860.0    

In [7]:
filtered_df = df[['text', 'sentiment']]
print(filtered_df.head())
df=filtered_df



                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative


In [8]:

sentiment_counts = df['sentiment'].value_counts()
print(sentiment_counts)




sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


In [9]:
from sklearn.utils import resample

positive = df[df['sentiment'] == "positive"]
neutral = df[df['sentiment'] == "neutral"]
negative = df[df['sentiment'] == "negative"]


max_size= max(len(positive),len(negative),len(neutral))


positive_balanced = resample(positive, replace=True, n_samples=max_size, random_state=42)
neutral_balanced = resample(neutral, replace=True, n_samples=max_size, random_state=42)
negative_balanced = resample(negative, replace=True, n_samples=max_size, random_state=42)

df_balanced = pd.concat([positive_balanced, neutral_balanced, negative_balanced])


In [10]:

sentiment_counts = df_balanced['sentiment'].value_counts()


print(sentiment_counts)

df=df_balanced

sentiment
positive    11118
neutral     11118
negative    11118
Name: count, dtype: int64


In [11]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [12]:
import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('punkt_tab')

def clean_text(text):
  text=re.sub(r"http\S+|www\S+|https\S+","",text,flags=re.MULTILINE)
  text = re.sub(r"@\w+|#\w+", "", text)
  text = re.sub(r"[^\w\s]", "", text)
  text = text.lower()
  return text


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [13]:
stop_words=set(stopwords.words("english"))
def preprocess_text(text):
    if not isinstance(text, str):
        text = ""

    text = clean_text(text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word.lower() not in stop_words]  # Case insensitive
    return " ".join(tokens)

df["cleaned_text"] = df["text"].apply(preprocess_text)

def map_sentiment(value):
    if value == "positive":
        return 1
    elif value == "neutral":
        return 0
    elif value == "negative":
        return -1
    else:
        return None

df["sentiment"] = df["sentiment"].apply(map_sentiment)


In [14]:
print (df.head())

                                                    text  sentiment  \
23091  grinning like a Cheshire cat.... ....happy as ...          1   
2666   Got the sniffles   I SO don`t want to get sick...          1   
17190  havent been on here in ages  sorry twitter.. t...          1   
16540   Yes Hindustan Rocks dude! Dunia mein asay koi...          1   
18192                      _anderson hehe   fun tweets !          1   

                                            cleaned_text  
23091     grinning like cheshire cat happy hell made day  
2666           got sniffles dont want get sick dont need  
17190         havent ages sorry twitter tweetdeck broken  
16540  yes hindustan rocks dude dunia mein asay koi f...  
18192                          _anderson hehe fun tweets  


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df["cleaned_text"]).toarray()
y = df["sentiment"].values

print( df.head())


                                                    text  sentiment  \
23091  grinning like a Cheshire cat.... ....happy as ...          1   
2666   Got the sniffles   I SO don`t want to get sick...          1   
17190  havent been on here in ages  sorry twitter.. t...          1   
16540   Yes Hindustan Rocks dude! Dunia mein asay koi...          1   
18192                      _anderson hehe   fun tweets !          1   

                                            cleaned_text  
23091     grinning like cheshire cat happy hell made day  
2666           got sniffles dont want get sick dont need  
17190         havent ages sorry twitter tweetdeck broken  
16540  yes hindustan rocks dude dunia mein asay koi f...  
18192                          _anderson hehe fun tweets  


In [16]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(max_features=5000)
X = vectorizer.fit_transform(df["cleaned_text"]).toarray()
y = df["sentiment"].values

print( df.head())


                                                    text  sentiment  \
23091  grinning like a Cheshire cat.... ....happy as ...          1   
2666   Got the sniffles   I SO don`t want to get sick...          1   
17190  havent been on here in ages  sorry twitter.. t...          1   
16540   Yes Hindustan Rocks dude! Dunia mein asay koi...          1   
18192                      _anderson hehe   fun tweets !          1   

                                            cleaned_text  
23091     grinning like cheshire cat happy hell made day  
2666           got sniffles dont want get sick dont need  
17190         havent ages sorry twitter tweetdeck broken  
16540  yes hindustan rocks dude dunia mein asay koi f...  
18192                          _anderson hehe fun tweets  


In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
#logistic regression
model = LogisticRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7868385549392894
              precision    recall  f1-score   support

          -1       0.81      0.78      0.79      2221
           0       0.72      0.76      0.74      2232
           1       0.84      0.83      0.83      2218

    accuracy                           0.79      6671
   macro avg       0.79      0.79      0.79      6671
weighted avg       0.79      0.79      0.79      6671



In [20]:
from sklearn.naive_bayes import MultinomialNB

#naive bayes
model = MultinomialNB()
model.fit(X_train, y_train)


y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.7103882476390346
              precision    recall  f1-score   support

          -1       0.73      0.76      0.75      2221
           0       0.64      0.60      0.62      2232
           1       0.75      0.78      0.76      2218

    accuracy                           0.71      6671
   macro avg       0.71      0.71      0.71      6671
weighted avg       0.71      0.71      0.71      6671



In [None]:
from sklearn.svm import SVC

# SVM
model = SVC(kernel="linear")
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
