In [None]:
import os
# Directory that contains kaggle.json (the Kaggle API credentials file) - not the key itself.
# setdefault() keeps a KAGGLE_CONFIG_DIR that is already configured in the environment;
# the path below is a machine-specific fallback and must be adjusted on other machines.
os.environ.setdefault('KAGGLE_CONFIG_DIR', 'C:/Users/itadi/Desktop/Python')

In [None]:
import pickle
import re

import kaggle
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC, LinearSVC
from tqdm import tqdm
from wordcloud import WordCloud
from xgboost import XGBClassifier

In [None]:
# Download dataset from Kaggle
def downloadDataset(dataset='kazanova/sentiment140', downloadPath='./dataset'):
    """
    Download a Kaggle dataset into a local folder, replacing any old files.

    Regular files located directly inside ``downloadPath`` are deleted first
    (sub-directories are left alone); the dataset is then downloaded and
    unzipped into the same folder.

    Parameters
    ----------
    dataset : str
        Kaggle dataset identifier passed to the Kaggle API. Defaults to the
        dataset analysed in this notebook.
    downloadPath : str
        Folder that receives the unzipped files; created when missing.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists()
    os.makedirs(downloadPath, exist_ok=True)

    deletedCount = 0
    for file in tqdm(os.listdir(downloadPath), desc="Processing files"):
        filePath = os.path.join(downloadPath, file)
        if not os.path.isfile(filePath):
            continue
        try:
            os.unlink(filePath)
            deletedCount += 1
        except OSError as e:
            # Best effort: report the failure and carry on with the other files
            print(f'Failed to delete {filePath}. Reason: {e}')

    # Report once, and only when something was actually removed
    if deletedCount:
        print('Old files deleted successfully!')

    kaggle.api.authenticate()
    kaggle.api.dataset_download_files(dataset, path=downloadPath, unzip=True)

    print('Dataset downloaded successfully!')

# Download dataset
# Note: every run of this cell deletes the files in ./dataset and downloads the data again.
downloadDataset()

In [None]:
# Fetch the NLTK stop-word corpus used by the stemming step
nltk.download('stopwords');
# Print the English stop-word list
# These words add little value to the text data, so the stemming step filters them out
print(stopwords.words('english'))

In [None]:
# Read the raw CSV; it is loaded without a header row, so the column names are supplied here
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
dataset = pd.read_csv(
    './dataset/training.1600000.processed.noemoticon.csv',
    names=column_names,
    header=None,
    encoding='ISO-8859-1',
)

In [None]:
# Shape of the dataset
print(f'Shape of the dataset: {dataset.shape}')
# Display columns in the dataset
print(f'Columns in the dataset: {dataset.columns}')

In [None]:
# Display first 5 rows of the dataset
dataset.head()

In [None]:
# Check for missing values
print(f'Missing values in the dataset: \n{dataset.isnull().sum()}')

In [None]:
# Print unique values in the target column
print(f'Unique values in the target column: {dataset.target.unique()}')
# Print value counts in the target column
print(f'Value counts in the target column: \n{dataset.target.value_counts()}')

In [None]:
# Recode the label 4 as 1 so the target becomes binary:
#   0 -> negative sentiment
#   1 -> positive sentiment
dataset['target'] = dataset['target'].replace(4, 1)

In [None]:
# Parse the date column into pandas datetimes
dataset['date'] = pd.to_datetime(dataset['date'])

# Derive calendar features through the .dt accessor
tweet_time = dataset['date'].dt
dataset['year'] = tweet_time.year
dataset['month'] = tweet_time.month
dataset['day'] = tweet_time.day
dataset['hour'] = tweet_time.hour
dataset['day_of_week'] = tweet_time.day_name()

# Count of tweets per weekday, shown Monday first
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
plt.figure(figsize=(10, 6))
sns.countplot(data=dataset, x='day_of_week', order=weekday_order)
plt.title('Tweets by Day of the Week')
plt.show()


In [None]:
# Strip URLs, mentions, hashtags and special characters from a tweet
def clean_text(text):
    """Return ``text`` with URLs, @mentions, #hashtags and special characters removed.

    The patterns are applied in the order listed below. Matches are deleted
    (not replaced by a space), so the surrounding whitespace is left untouched.
    """
    removal_patterns = (
        r'http\S+',   # URLs
        r'@\w+',      # mentions
        r'#\w+',      # hashtags
        r'[^\w\s]',   # remaining special characters
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text

dataset['clean_text'] = dataset['text'].apply(clean_text)

# Preview the cleaned text
dataset[['text', 'clean_text']].head()

In [None]:
# Concatenate every cleaned tweet into one space-separated string
all_words = ' '.join(dataset['clean_text'])

# Build the word cloud from the combined text
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_words)

# Render the cloud as an image without axes
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of All Tweets')
plt.show()


In [None]:
# Character count of each cleaned tweet
dataset['tweet_length'] = dataset['clean_text'].map(len)

# Histogram of the lengths (45 bins) with a KDE overlay
tweet_lengths = dataset['tweet_length']
plt.figure(figsize=(10, 6))
sns.histplot(tweet_lengths, bins=45, kde=True)
plt.title('Distribution of Tweet Lengths')
plt.show()


In [None]:
# The ten accounts with the most tweets, most active first
top_users = dataset['user'].value_counts().head(10)
user_names = top_users.index
tweet_counts = top_users.values

# Bar chart of tweet count per user; labels are rotated to stay readable
plt.figure(figsize=(10, 6))
sns.barplot(x=user_names, y=tweet_counts)
plt.title('Top 10 Users by Tweet Count')
plt.xticks(rotation=45)
plt.show()

### Stemming

In [None]:
# Stemming is the process of reducing a word to its root form by removing suffixes. Example: "running" -> "run"
# The Porter stemming algorithm is the most widely used method for stemming in English
# Because the dataset is large, stemming helps by collapsing inflected forms of a word into one token, which reduces the number of unique words (the vocabulary size)

In [None]:
# Shared helpers, built once instead of once per tweet
porter = PorterStemmer()
regex = re.compile('[^a-zA-Z]')  # matches any character that is not an ASCII letter
stop_words = set(stopwords.words('english'))  # a set gives faster lookups than a list

def stemming(text):
    """
    Normalise a tweet: keep letters only, lowercase, drop stop words, stem.

    Returns the remaining stemmed tokens joined by single spaces.
    """
    tokens = regex.sub(' ', text).lower().split()
    kept_stems = (porter.stem(token) for token in tokens if token not in stop_words)
    return ' '.join(kept_stems)

# Parallel processing using joblib to speed up
def parallelize_dataframe(dataset, func, n_jobs=4):
    """
    Apply ``func`` to a pandas object in parallel and return the combined result.

    The input is cut into ``n_jobs`` contiguous chunks, ``chunk.apply(func)`` is
    run on each chunk by a joblib worker, and the partial results are
    concatenated in their original order.

    Parameters
    ----------
    dataset : pandas.Series or pandas.DataFrame
        Object whose ``apply`` method is called with ``func``.
    func : callable
        Function handed to ``apply``.
    n_jobs : int, default 4
        Number of chunks / workers. A value <= 0 means "one per CPU".

    Returns
    -------
    The concatenation of ``chunk.apply(func)`` over all chunks.
    """
    if n_jobs <= 0:
        # os.cpu_count() may return None when the count cannot be determined
        n_jobs = os.cpu_count() or 1

    n_rows = len(dataset)
    if n_rows == 0:
        # Nothing to distribute; skip starting any workers
        return dataset.apply(func)
    n_jobs = min(n_jobs, n_rows)  # never create empty chunks

    # Slice by position instead of np.array_split: array_split goes through the
    # pandas ``swapaxes`` method, which pandas has deprecated.
    # Chunk sizes follow the same rule: the first ``extra`` chunks get one more row.
    base, extra = divmod(n_rows, n_jobs)
    chunks = []
    start = 0
    for i in range(n_jobs):
        stop = start + base + (1 if i < extra else 0)
        chunks.append(dataset.iloc[start:stop])
        start = stop

    def _apply_to_chunk(chunk):
        return chunk.apply(func)

    results = Parallel(n_jobs=n_jobs)(delayed(_apply_to_chunk)(chunk) for chunk in chunks)
    return pd.concat(results)


# Now pass only the stemming function, not the apply() result
# NOTE(review): this stems the raw `text` column, not the `clean_text` column built earlier,
# so letter fragments of URLs, mentions and hashtags still reach the stemmer - confirm this is intended.
dataset['stemmed_text'] = parallelize_dataframe(dataset['text'], stemming)


In [None]:
dataset.head()

In [None]:
processed_dataset = dataset[['target', 'stemmed_text']]

In [None]:
X = processed_dataset['stemmed_text']
y = processed_dataset['target']

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=22)

In [None]:
# Class counts of each split, computed once and reused for printing and plotting
train_counts = y_train.value_counts()
test_counts = y_test.value_counts()
print(f'Training set: \n{train_counts}')
print(f'Testing set: \n{test_counts}')

# One column per split so both can be drawn side by side
train_test_counts = pd.DataFrame({'Training': train_counts, 'Testing': test_counts})

# Grouped bar chart: one group per class, one bar per split
train_test_counts.plot(kind='bar', figsize=(10, 6))
plt.title('Class Distribution in Training and Testing Sets')
plt.xlabel('Class')
plt.ylabel('Count')
plt.legend(loc='best')
plt.show()


In [None]:
# Convert text data to numerical data using TF-IDF (TfidfVectorizer)
# Note: X_train and X_test are overwritten with the vectorized features below,
# so re-run the train/test split cell before re-running this cell.

vectorizer = TfidfVectorizer()

X_train = vectorizer.fit_transform(X_train) # fit_transform is used to learn the vocabulary from the training data and then transform it
X_test = vectorizer.transform(X_test) # transform is applied to the test data using the same learned vocabulary

#### Logistic Regression

In [None]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

In [None]:
train_predictions = model.predict(X_train)
accuracy_train_preds = accuracy_score(y_train, train_predictions)
print(f'Training accuracy: {accuracy_train_preds}')

In [None]:
test_predictions = model.predict(X_test)
accuracy_test_preds = accuracy_score(y_test, test_predictions)
print(f'Test accuracy: {accuracy_test_preds}')

In [None]:
# Save the model; the context manager closes the file handle even if pickling fails
model_name = 'sentiment_analysis_model.pkl'
with open(model_name, 'wb') as model_file:
    pickle.dump(model, model_file)
# NOTE: the fitted `vectorizer` is not saved here; the pickled model can only score
# text that has been transformed by that same vectorizer.

#### SVM

In [None]:
# Train a linear SVM.
# LinearSVC replaces SVC(kernel='linear'): scikit-learn documents that SVC's fit time
# scales at least quadratically with the number of samples and recommends LinearSVC
# for large datasets such as this one.
svm = LinearSVC(random_state=42)
svm.fit(X_train, y_train)

# Make predictions
y_pred_svm = svm.predict(X_test)

# Evaluate accuracy
print(f'SVM Accuracy: {accuracy_score(y_test, y_pred_svm)}')

#### Random Forest

In [29]:
# Train the Random Forest model
# n_jobs=-1 builds (and later evaluates) the trees on all available cores;
# it only changes how the work is scheduled, not the model settings.
rf = RandomForestClassifier(n_estimators=100, random_state=42, verbose=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = rf.predict(X_test)

# Evaluate accuracy
print(f'Random Forest Accuracy: {accuracy_score(y_test, y_pred_rf)}')

#### XGBoost

In [None]:
# Train the XGBoost model
# The target is binary (0/1), so the matching metric is 'logloss';
# 'mlogloss' is the multi-class variant.
# use_label_encoder is omitted: it is a deprecated no-op in recent XGBoost releases.
xgb = XGBClassifier(eval_metric='logloss')
xgb.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb.predict(X_test)

# Evaluate accuracy
print(f'XGBoost Accuracy: {accuracy_score(y_test, y_pred_xgb)}')