Mounted at /content/drive


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Function to clean the tweet text
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove user @ references and '#' from tweet
    text = re.sub(r'\@\w+|\#','', text)
    # Remove punctuations
    text = re.sub(r'[^\w\s]', '', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove excess whitespace
    text = text.strip()
    # Debug statement to check the preprocessing
    print(f"Preprocessed text: {text}")
    return text

# Load dataset
file_path = '/content/tweets - tweets.csv'

df = pd.read_csv(file_path)

# Apply preprocessing to the 'tweet' column
df['tweet'] = df['tweet'].apply(preprocess_text)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df['tweet'], df['label'], test_size=0.2)

# Display the first few rows of the processed data
print(df.head())


In [9]:
!pip install nltk
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import string

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_tweet(tweet):
    # Tokenization
    tokens = word_tokenize(tweet)

    # Lowercasing
    tokens = [token.lower() for token in tokens]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Remove punctuation and non-alphabetic tokens
    tokens = [token for token in tokens if token.isalpha()]

    # Rejoin tokens into a single string
    clean_tweet = ' '.join(tokens)

    # Debug statement
    print(f"Original: {tweet}\nProcessed: {clean_tweet}\n")

    return clean_tweet

df['processed_tweet'] = df['tweet'].apply(preprocess_tweet)

# Display the DataFrame to verify results
print(df[['tweet', 'processed_tweet']])


In [11]:
!pip install scikit-learn gensim




In [12]:
import gensim.downloader as api

# Download the model and return as object ready for use
model_glove_twitter = api.load("glove-twitter-25")




In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Step 1: BoW Vectorization
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(df['processed_tweet'])

# Step 2: TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_tweet'])

# Step 3: GloVe Word Embeddings
def get_embedding_vector(tweet, model):
    # Tokenize the tweet
    words = tweet.split()

    # Retrieve the vector for each word in the tweet and take the mean to represent the string of words
    embeddings = [model[word] for word in words if word in model]
    if embeddings:
        embeddings = np.mean(embeddings, axis=0)
    else:
        # If the tweet contains no words with embeddings, return zeros
        embeddings = np.zeros(model.vector_size)
    return embeddings

# Apply the function to each tweet to get the embedding vector
df['embedding_vector'] = df['processed_tweet'].apply(lambda tweet: get_embedding_vector(tweet, model_glove_twitter))

# Convert the list of vectors into a numpy matrix, which can be used as input to ML models
embedding_matrix = np.array(list(df['embedding_vector']))


In [14]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from scipy.sparse import hstack


# 4. Splitting the Dataset
# Concatenate BoW and TF-IDF features
X_combined = hstack([X_bow, X_tfidf])
y = df['label']

# Split the combined features
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Split the GloVe features
X_train_glove, X_test_glove, _, _ = train_test_split(embedding_matrix, y, test_size=0.2, random_state=42)

# 5. Model Selection
clf_combined = RandomForestClassifier(random_state=42)
clf_glove = RandomForestClassifier(random_state=42)

# 6. Model Training
clf_combined.fit(X_train_combined, y_train)
clf_glove.fit(X_train_glove, y_train)

# 7. Model Evaluation
# Predict using the combined BoW and TF-IDF model
pred_combined = clf_combined.predict(X_test_combined)
# Predict using the GloVe model
pred_glove = clf_glove.predict(X_test_glove)

# 8. Hyperparameter Tuning
# Here we will use a simple grid search on the combined model as an example
# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20, None]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(estimator=clf_combined, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train_combined, y_train)

# Get the best estimator
best_clf_combined = grid_search.best_estimator_

# 9. Model Testing
# Make predictions with the best model
best_pred_combined = best_clf_combined.predict(X_test_combined)

# Combine predictions from the two models (e.g., by averaging their prediction probabilities)
final_pred = np.round((best_pred_combined + pred_glove) / 2).astype(int)

# Calculate the final accuracy
final_accuracy = accuracy_score(y_test, final_pred)
print(f"Final Accuracy: {final_accuracy}")


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Final Accuracy: 0.8465909090909091


In [15]:
# using ANN
import numpy as np
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Input, Dense, Dropout, concatenate
from keras.utils import to_categorical
from scipy.sparse import hstack
from keras.callbacks import EarlyStopping

# Assuming you have X_bow, X_tfidf, and embedding_matrix ready from the previous steps
# Also assuming y is a numpy array of your labels

# One-hot encode the labels
y_encoded = to_categorical(y)

# 4. Splitting the Dataset
# Concatenate BoW and TF-IDF features
X_combined = hstack([X_bow, X_tfidf])

# Split the combined features into train and test sets
X_train_combined, X_test_combined, y_train, y_test = train_test_split(X_combined, y_encoded, test_size=0.2, random_state=42)

# Split the GloVe features into train and test sets
X_train_glove, X_test_glove, _, _ = train_test_split(embedding_matrix, y_encoded, test_size=0.2, random_state=42)

# Convert the combined sparse matrix to a dense one for Keras
X_train_combined = X_train_combined.todense()
X_test_combined = X_test_combined.todense()

# 5. Model Definition for Combined BoW and TF-IDF
input_combined = Input(shape=(X_train_combined.shape[1],))
dense_combined = Dense(64, activation='relu')(input_combined)
dropout_combined = Dropout(0.5)(dense_combined)
output_combined = Dense(y_encoded.shape[1], activation='softmax')(dropout_combined)
model_combined = Model(inputs=input_combined, outputs=output_combined)

# Model Definition for GloVe
input_glove = Input(shape=(X_train_glove.shape[1],))
dense_glove = Dense(64, activation='relu')(input_glove)
dropout_glove = Dropout(0.5)(dense_glove)
output_glove = Dense(y_encoded.shape[1], activation='softmax')(dropout_glove)
model_glove = Model(inputs=input_glove, outputs=output_glove)

# Compile the models
model_combined.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model_glove.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

# 6. Model Training
model_combined.fit(X_train_combined, y_train, validation_split=0.1, epochs=10, batch_size=32, callbacks=[early_stopping])
model_glove.fit(X_train_glove, y_train, validation_split=0.1, epochs=10, batch_size=32, callbacks=[early_stopping])

# 7. Model Evaluation
pred_combined = model_combined.predict(X_test_combined)
pred_glove = model_glove.predict(X_test_glove)

# 9. Model Testing
# Averaging predictions from both models
final_pred = (pred_combined + pred_glove) / 2
final_pred_labels = np.argmax(final_pred, axis=1)

# Convert one-hot encoded y_test back to labels
y_test_labels = np.argmax(y_test, axis=1)

# Calculate the final accuracy
final_accuracy = np.mean(final_pred_labels == y_test_labels)
print(f"Final Accuracy: {final_accuracy}")


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Final Accuracy: 0.889520202020202
