In [54]:
#Importing all required packages

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [89]:
# Load the training and validation splits from CSV.
# header=None tells pandas the files carry no header row, so the first
# record is read as data and the columns get default integer names.
training_data = pd.read_csv(
    filepath_or_buffer='repos/final-project/twitter_training.csv',
    header=None,
)
validation_data = pd.read_csv(
    filepath_or_buffer='repos/final-project/twitter_validation.csv',
    header=None,
)

In [58]:
# Give both frames the same descriptive column names
training_data.columns = validation_data.columns = ['TweetID', 'Entity', 'Sentiment', 'Tweet']

# Preview the first rows of each split (training first, then validation)
print(training_data.head(), validation_data.head(), sep='\n')

   TweetID       Entity Sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                               Tweet  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
   TweetID     Entity   Sentiment  \
0     3364   Facebook  Irrelevant   
1      352     Amazon     Neutral   
2     8312  Microsoft    Negative   
3     4371      CS-GO    Negative   
4     4433     Google     Neutral   

                                               Tweet  
0  I mentioned on Facebook that I was struggling ...  
1  BBC News - Amazon boss Jeff Bezos rejects clai...  
2  @Microsoft Why do I pay for WORD when it funct...  
3  C

In [60]:
# Map sentiment labels to integer class ids
convert_sentiment = dict(Positive=0, Negative=1, Neutral=2)

# Series.map with a dict yields NaN for any value that is not a key, so
# sentiments outside the mapping (the validation preview shows an
# 'Irrelevant' class) end up with a NaN label. That is also why the
# 'label' column is float rather than int.
for _frame in (training_data, validation_data):
    _frame['label'] = _frame['Sentiment'].map(convert_sentiment)

In [62]:
# Combine Entity and Tweet into a single text field ("<Entity> <Tweet>")
training_data['text'] = training_data['Entity'].add(" ").add(training_data['Tweet'])
validation_data['text'] = validation_data['Entity'].add(" ").add(validation_data['Tweet'])

# Discard rows that lack either the combined text or a mapped label
training_data, validation_data = (
    frame.dropna(subset=['text', 'label'])
    for frame in (training_data, validation_data)
)

# Preview the cleaned splits (training first, then validation)
print(training_data.head(), validation_data.head(), sep='\n')

   TweetID       Entity Sentiment  \
0     2401  Borderlands  Positive   
1     2401  Borderlands  Positive   
2     2401  Borderlands  Positive   
3     2401  Borderlands  Positive   
4     2401  Borderlands  Positive   

                                               Tweet  label  \
0  im getting on borderlands and i will murder yo...    0.0   
1  I am coming to the borders and I will kill you...    0.0   
2  im getting on borderlands and i will kill you ...    0.0   
3  im coming on borderlands and i will murder you...    0.0   
4  im getting on borderlands 2 and i will murder ...    0.0   

                                                text  
0  Borderlands im getting on borderlands and i wi...  
1  Borderlands I am coming to the borders and I w...  
2  Borderlands im getting on borderlands and i wi...  
3  Borderlands im coming on borderlands and i wil...  
4  Borderlands im getting on borderlands 2 and i ...  
   TweetID     Entity Sentiment  \
1      352     Amazon   Neutral  

In [64]:
# Create TF-IDF features
# max_features caps the vocabulary at the 5000 top terms by corpus frequency.
vectorizer = TfidfVectorizer(max_features=5_000)

# Targets: the numeric sentiment labels of each split
y_train, y_val = training_data['label'], validation_data['label']

# Fit the vocabulary / IDF weights on the training text only; the validation
# text is transformed with that already-fitted vectorizer.
X_training_data = vectorizer.fit_transform(raw_documents=training_data['text'])
X_validation_data = vectorizer.transform(raw_documents=validation_data['text'])

In [76]:
# Build a logistic-regression classifier and fit it on the TF-IDF training
# matrix. max_iter is set to 1000 (above scikit-learn's default of 100),
# presumably so the solver has room to converge on the sparse features.
model = LogisticRegression(max_iter=1_000)
model.fit(X=X_training_data, y=y_train)

In [80]:
# Predict on validation set
y_pred = model.predict(X_validation_data)

# Calculate Top-1 Accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

# Detailed report
# classification_report pairs target_names positionally with the labels it
# reports on. Without an explicit `labels` argument those are inferred from
# the data, so a class missing from both y_val and y_pred would make the
# three names mismatch the inferred classes. Deriving both lists from
# convert_sentiment pins the id -> name pairing to the mapping that
# produced the labels and removes the duplicated name list.
print(classification_report(
    y_val,
    y_pred,
    labels=list(convert_sentiment.values()),
    target_names=list(convert_sentiment.keys()),
))

Validation Accuracy: 0.8587
              precision    recall  f1-score   support

    Positive       0.86      0.88      0.87       277
    Negative       0.83      0.90      0.86       266
     Neutral       0.88      0.80      0.84       285

    accuracy                           0.86       828
   macro avg       0.86      0.86      0.86       828
weighted avg       0.86      0.86      0.86       828

