##Sentiment Analysis

## Data loading
load the dataset


In [21]:
import pandas as pd

df = pd.read_csv('twitter_training.csv')
display(df.head())

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


## Data exploration




In [22]:
# Examine the DataFrame's shape
print("DataFrame shape:", df.shape)

# Check the data types of each column
print("\nData types:\n", df.dtypes)

# Identify missing values
print("\nMissing values:\n", df.isnull().sum())

# Analyze the distribution of sentiment labels
print("\nSentiment label distribution:\n", df['Positive'].value_counts())

# Examine the text data
print("\nSample tweets:\n", df['im getting on borderlands and i will murder you all ,'].head(5))

DataFrame shape: (74681, 4)

Data types:
 2401                                                      int64
Borderlands                                              object
Positive                                                 object
im getting on borderlands and i will murder you all ,    object
dtype: object

Missing values:
 2401                                                       0
Borderlands                                                0
Positive                                                   0
im getting on borderlands and i will murder you all ,    686
dtype: int64

Sentiment label distribution:
 Positive
Negative      22542
Positive      20831
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64

Sample tweets:
 0    I am coming to the borders and I will kill you...
1    im getting on borderlands and i will kill you ...
2    im coming on borderlands and i will murder you...
3    im getting on borderlands 2 and i will murder ...
4    im getting into borderla

## Data cleaning
Handling missing values,converting to lowercase, removing irrelevant characters, tokenizing, removing stop words, and performing stemming/lemmatization.


In [23]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords', quiet=True)

# 1. Handle missing values
df['im getting on borderlands and i will murder you all ,'] = df['im getting on borderlands and i will murder you all ,'].fillna('')

# 2. Convert to lowercase
df['im getting on borderlands and i will murder you all ,'] = df['im getting on borderlands and i will murder you all ,'].str.lower()

# 3. Remove irrelevant characters
df['im getting on borderlands and i will murder you all ,'] = df['im getting on borderlands and i will murder you all ,'].apply(lambda x: re.sub(r'[^a-zA-Z\s]', '', x))

# 4. Tokenize
df['tokenized_tweet'] = df['im getting on borderlands and i will murder you all ,'].apply(lambda x: x.split())

# 5. Remove stop words
stop_words = set(stopwords.words('english'))
df['tokenized_tweet'] = df['tokenized_tweet'].apply(lambda x: [word for word in x if word not in stop_words])

# 6. Perform stemming
stemmer = PorterStemmer()
df['stemmed_tweet'] = df['tokenized_tweet'].apply(lambda x: [stemmer.stem(word) for word in x])

display(df.head())

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,",tokenized_tweet,stemmed_tweet
0,2401,Borderlands,Positive,i am coming to the borders and i will kill you...,"[coming, borders, kill]","[come, border, kill]"
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you all,"[im, getting, borderlands, kill]","[im, get, borderland, kill]"
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...,"[im, coming, borderlands, murder]","[im, come, borderland, murder]"
3,2401,Borderlands,Positive,im getting on borderlands and i will murder y...,"[im, getting, borderlands, murder]","[im, get, borderland, murder]"
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...,"[im, getting, borderlands, murder]","[im, get, borderland, murder]"


## Feature engineering
Extract features from the cleaned tweet text using TF-IDF.


In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert the list of stemmed words back into a string for each tweet
df['stemmed_tweet_str'] = df['stemmed_tweet'].apply(lambda x: ' '.join(x))

# Instantiate a TfidfVectorizer object
tfidf_vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Fit the TfidfVectorizer to the stemmed_tweet_str column
tfidf_vectorizer.fit(df['stemmed_tweet_str'])

# Transform the stemmed_tweet_str column into a TF-IDF matrix
tfidf_matrix = tfidf_vectorizer.transform(df['stemmed_tweet_str'])

# Explore the vocabulary extracted by the TfidfVectorizer (optional)
# print(tfidf_vectorizer.vocabulary_)

## Data splitting
Split the dataset into training and testing sets.


In [25]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, df['Positive'], test_size=0.2, random_state=42)

## Model training
Train a Logistic Regression model for sentiment classification.


In [26]:
from sklearn.linear_model import LogisticRegression

# Instantiate a LogisticRegression object
model = LogisticRegression(solver='lbfgs', max_iter=1000)

# Train the model on the training data
model.fit(X_train, y_train)

## Model evaluation
Evaluate the trained Logistic Regression model's performance on the testing set.


In [27]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Make predictions on the test set
y_pred = model.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Calculate precision, recall, and F1-score
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

Accuracy: 0.6732
Precision: 0.6733
Recall: 0.6732
F1-Score: 0.6708


## Model optimization
Optimize the Logistic Regression model to improve its accuracy.


In [28]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define the parameter grid to search
param_grid = {
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
    'max_iter': [1000]
}

# Create a LogisticRegression model
model = LogisticRegression()

# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters:", grid_search.best_params_)
print("Best accuracy:", grid_search.best_score_)

# Use the best model to predict the test set
y_pred = grid_search.best_estimator_.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1_score:.4f}")

Best parameters: {'C': 10, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}
Best accuracy: 0.6990994899845379
Accuracy: 0.7030
Precision: 0.7035
Recall: 0.7030
F1-Score: 0.7013
