TF-IDF with Logistic Regression

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import string
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from tkinter.constants import X
from sklearn.model_selection import train_test_split

# Fetch the NLTK corpora this cell relies on: WordNet (for the lemmatizer)
# and the English stop-word list.
for _corpus in ('wordnet', 'stopwords'):
    nltk.download(_corpus)

# Stop words are kept as a set because they are only used for membership tests.
stop_words = set(stopwords.words('english'))

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Mount Google Drive so the training CSV below is reachable.
from google.colab import drive
drive.mount('/content/drive/')

# The CSV is read without a header row; column labels are attached afterwards.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/My Folder/tweeter_training (3).csv',
    header=None,
    encoding="ISO-8859-1",
)
tw_df.columns = column_names
tweet_column_index = 5  # position of 'text' in column_names


# Preprocessing function with lemmatization
def my_preprocessor(text):
    """Turn one raw tweet into a space-joined string of cleaned lemmas.

    After lowercasing and ``TweetTokenizer`` tokenization, each token is:

    1. dropped if it begins with a Twitter handle (``@name``) or a URL;
    2. dropped if it occurs as a substring of ``string.punctuation``;
    3. dropped if it is in the module-level ``stop_words``;
    4. lemmatized with the module-level ``lemmatizer``;
    5. dropped if the resulting lemma is in ``stop_words``.

    Stop words are filtered before lemmatizing as well as after. With only
    the post-lemmatization check, the default noun lemmatization can rewrite
    a stop word into something that is no longer in the list (WordNet turns
    'was' into 'wa', for example), so it would leak into the features.

    Args:
        text: Raw tweet text.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    handle_pattern = re.compile(r'@\w+')  # Twitter handles
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')  # URLs

    kept = []
    for token in TweetTokenizer().tokenize(text.lower()):
        # .match() anchors at the start of the token only.
        if handle_pattern.match(token) or url_pattern.match(token):
            continue
        # NOTE: `in` on a str is a substring test, so single punctuation
        # characters are dropped while runs such as '...' are kept.
        if token in string.punctuation:
            continue
        # Filter on the surface form first so lemmatization cannot disguise
        # a stop word.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Still filter on the lemma, as the original did.
        if lemma in stop_words:
            continue
        kept.append(lemma)
    return ' '.join(kept)

# Draw a balanced sample: the same number of rows from each target class
# (4 -> pos_samples, 0 -> neg_samples), with a fixed seed for repeatability.
sample_size_per_class = 50000
pos_samples = tw_df[tw_df['target'] == 4].sample(sample_size_per_class, random_state=2)
neg_samples = tw_df[tw_df['target'] == 0].sample(sample_size_per_class, random_state=2)

all_samples = pd.concat([pos_samples, neg_samples])
# A bare `value_counts()` in the middle of a cell displays nothing, so the
# class-balance check is made explicit instead.
assert (all_samples['target'].value_counts() == sample_size_per_class).all()

# Train and test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(all_samples['text'], all_samples['target'], test_size=0.2, random_state=2)

# my_preprocessor is applied to each raw tweet before TF-IDF tokenization.
vectorizer = TfidfVectorizer(max_features=5000, preprocessor= my_preprocessor)

# a. Fit the TF-IDF vectorizer on the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# b. Transform the test data with the vectorizer fitted on the training data
X_test_tfidf = vectorizer.transform(X_test)

# c. The dense `.toarray()` copies were removed: nothing below used them
#    (the classifier is fitted on the sparse matrices directly), and they
#    materialized an 80,000 x 5,000 and a 20,000 x 5,000 dense array.

classifier = LogisticRegression(max_iter=1000)
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Calculate accuracy (computed once; the original evaluated it twice)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Mounted at /content/drive/
Logistic Regression Accuracy: 0.75895


GloVe-200-Twitter with Logistic Regression

In [None]:
import numpy as np
import pandas as pd
import re
import string
import nltk
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import gensim.downloader as api
from google.colab import drive

# Download necessary NLTK resources
for _resource in ('wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Initialize stopwords (the corpus was fetched just above)
stop_words = set(stopwords.words('english'))

# Mount Google Drive
drive.mount('/content/drive/')

# Load Twitter dataset; the file is read without a header row, so the
# column labels are attached after loading.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/My Folder/tweeter_training (3).csv',
    encoding='ISO-8859-1',
    header=None,
)
tw_df.columns = column_names

def my_preprocessor(text):
    """Clean one tweet and return its surviving tokens joined by spaces.

    The text is lowercased and split with ``TweetTokenizer``. A token is
    discarded when it begins with a Twitter handle or a URL, when it occurs
    as a substring of ``string.punctuation``, or when it is in the
    module-level ``stop_words``. No lemmatization is applied here.

    Args:
        text: Raw tweet text.

    Returns:
        The kept tokens joined by single spaces ('' if none are kept).
    """
    starts_with_handle = re.compile(r'@\w+').match
    starts_with_url = re.compile(r'http[s]?://\S+|www\.\S+').match

    def keep(token):
        # Handles and URLs are recognised at the start of the token only.
        if starts_with_handle(token) or starts_with_url(token):
            return False
        # Substring test: single punctuation marks go, runs like '...' stay.
        if token in string.punctuation:
            return False
        return token not in stop_words

    return ' '.join(filter(keep, TweetTokenizer().tokenize(text.lower())))

# Sample dataset: the same number of rows from each target class
# (4 -> pos_samples, 0 -> neg_samples), with a fixed seed.
sample_size_per_class = 50000
pos_samples = tw_df[tw_df['target'] == 4].sample(sample_size_per_class, random_state=2)
neg_samples = tw_df[tw_df['target'] == 0].sample(sample_size_per_class, random_state=2)
all_samples = pd.concat([pos_samples, neg_samples])

# Preprocess tweets, but only the sampled rows. Sampling does not depend on
# the 'processed_text' column, so the same rows are drawn as when the whole
# of tw_df was preprocessed first; the work on rows that are never used is
# simply skipped.
all_samples['processed_text'] = all_samples['text'].apply(my_preprocessor)

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(all_samples['processed_text'], all_samples['target'], test_size=0.2, random_state=2)

# Load pre-trained GloVe model
word_embedding_model = api.load("glove-twitter-200")

# Function to compute average GloVe embeddings
def get_embeddings_avg(texts, model):
    """Represent each text by the mean of its in-vocabulary word vectors.

    Args:
        texts: Iterable of strings; each one is split with ``word_tokenize``.
        model: Embedding lookup exposing ``key_to_index`` (vocabulary
            membership), ``model[word]`` (vector lookup) and ``vector_size``.

    Returns:
        ``np.ndarray`` with one row per text. A text with no in-vocabulary
        token gets a zero vector of length ``model.vector_size``.
    """
    vocabulary = model.key_to_index
    rows = []
    for document in texts:
        vectors = [model[tok] for tok in word_tokenize(document) if tok in vocabulary]
        if vectors:
            rows.append(np.mean(vectors, axis=0))
        else:
            # No known words: fall back to an all-zero embedding.
            rows.append(np.zeros(model.vector_size))
    return np.array(rows)

# Embed the train and test splits with the same averaged-GloVe featurizer
X_train_glove, X_test_glove = (
    get_embeddings_avg(split, word_embedding_model) for split in (X_train, X_test)
)

# Fit logistic regression on the training embeddings (fit() returns the estimator)
classifier = LogisticRegression(max_iter=1000).fit(X_train_glove, y_train)

# Predict on the held-out embeddings and report accuracy
y_pred = classifier.predict(X_test_glove)
accuracy = accuracy_score(y_test, y_pred)
print("Logistic Regression Accuracy:", accuracy)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Logistic Regression Accuracy: 0.7441


TF-IDF with Random Forest

In [None]:
import pandas as pd
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
import string
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from tkinter.constants import X
from sklearn.model_selection import train_test_split

# NLTK corpora needed by this cell: WordNet for the lemmatizer, plus the
# English stop-word list.
for _corpus in ('wordnet', 'stopwords'):
    nltk.download(_corpus)

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

# Stop words as a set; they are only used for membership tests.
stop_words = set(stopwords.words('english'))

# Mount Google Drive so the training CSV below is reachable.
from google.colab import drive
drive.mount('/content/drive/')

# Read the CSV without a header row, then attach the column labels.
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/My Folder/tweeter_training (3).csv',
    header=None,
    encoding="ISO-8859-1",
)
tw_df.columns = column_names
tweet_column_index = 5  # position of 'text' in column_names


# Preprocessing function with lemmatization
def my_preprocessor(text):
    """Turn one raw tweet into a space-joined string of cleaned lemmas.

    After lowercasing and ``TweetTokenizer`` tokenization, each token is:

    1. dropped if it begins with a Twitter handle (``@name``) or a URL;
    2. dropped if it occurs as a substring of ``string.punctuation``;
    3. dropped if it is in the module-level ``stop_words``;
    4. lemmatized with the module-level ``lemmatizer``;
    5. dropped if the resulting lemma is in ``stop_words``.

    Stop words are filtered before lemmatizing as well as after. With only
    the post-lemmatization check, the default noun lemmatization can rewrite
    a stop word into something that is no longer in the list (WordNet turns
    'was' into 'wa', for example), so it would leak into the features.

    Args:
        text: Raw tweet text.

    Returns:
        The surviving lemmas joined by single spaces ('' if none survive).
    """
    handle_pattern = re.compile(r'@\w+')  # Twitter handles
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')  # URLs

    kept = []
    for token in TweetTokenizer().tokenize(text.lower()):
        # .match() anchors at the start of the token only.
        if handle_pattern.match(token) or url_pattern.match(token):
            continue
        # NOTE: `in` on a str is a substring test, so single punctuation
        # characters are dropped while runs such as '...' are kept.
        if token in string.punctuation:
            continue
        # Filter on the surface form first so lemmatization cannot disguise
        # a stop word.
        if token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Still filter on the lemma, as the original did.
        if lemma in stop_words:
            continue
        kept.append(lemma)
    return ' '.join(kept)

# Balanced sample: the same number of rows from each target class
# (4 -> pos_samples, 0 -> neg_samples), drawn with a fixed seed.
sample_size_per_class = 50000
pos_samples = tw_df.loc[tw_df['target'].eq(4)].sample(n=sample_size_per_class, random_state=2)
neg_samples = tw_df.loc[tw_df['target'].eq(0)].sample(n=sample_size_per_class, random_state=2)

all_samples = pd.concat([pos_samples, neg_samples])
# Not the last expression of the cell, so this count is computed but not shown.
all_samples['target'].value_counts()

# Train and test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    all_samples['text'],
    all_samples['target'],
    test_size=0.2,
    random_state=2,
)

# my_preprocessor is applied to each raw tweet before TF-IDF tokenization.
vectorizer = TfidfVectorizer(preprocessor=my_preprocessor, max_features=5000)

# a. Fit the TF-IDF vectorizer on the training data and transform it
X_train_tfidf = vectorizer.fit_transform(X_train)

# b. Transform the test data with the already-fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Train Random Forest Classifier model (fit() returns the estimator)
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_tfidf, y_train)

# Predict using Random Forest
y_pred_rf = rf_classifier.predict(X_test_tfidf)

# Evaluate Model
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Random Forest Performance:
Accuracy: 0.74495


GloVe-200-Twitter with Random Forest

In [None]:
import re
import string

import gensim.downloader as api
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from google.colab import drive
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, TweetTokenizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Download necessary NLTK resources
for _resource in ('wordnet', 'stopwords', 'punkt_tab'):
    nltk.download(_resource)

# Mount Google Drive
drive.mount('/content/drive/')

# Load Twitter dataset (no header row in the file; labels are set afterwards)
column_names = ['target', 'ids', 'date', 'flag', 'user', 'text']
tw_df = pd.read_csv(
    '/content/drive/MyDrive/Colab Notebooks/My Folder/tweeter_training (3).csv',
    encoding='ISO-8859-1',
    header=None,
)
tw_df.columns = column_names

# Initialize stopwords
stop_words = set(stopwords.words('english'))

def my_preprocessor(text):
    """Clean one tweet and return its surviving tokens joined by spaces.

    The text is lowercased and split with ``TweetTokenizer``. Tokens that
    begin with a Twitter handle or a URL, tokens that occur as a substring
    of ``string.punctuation``, and tokens in the module-level ``stop_words``
    are removed. No lemmatization is applied here.

    Args:
        text: Raw tweet text.

    Returns:
        The kept tokens joined by single spaces ('' if none are kept).
    """
    handle_pattern = re.compile(r'@\w+')  # Twitter handles
    url_pattern = re.compile(r'http[s]?://\S+|www\.\S+')  # URLs

    # `word_tokenize` is a plain function, so the previous
    # `word_tokenize().tokenize(text)` raised TypeError on every call.
    # TweetTokenizer is what the other cells use for this step.
    tokens = TweetTokenizer().tokenize(text.lower())

    # .match() anchors at the start of the token only.
    tokens_cleaned = [
        token for token in tokens
        if not handle_pattern.match(token) and not url_pattern.match(token)
    ]
    # Substring test: single punctuation marks go, runs like '...' stay.
    tokens_no_punc = [token for token in tokens_cleaned if token not in string.punctuation]
    tokens_wo_stopword = [token for token in tokens_no_punc if token not in stop_words]
    return ' '.join(tokens_wo_stopword)

# Sample dataset: the same number of rows from each target class
# (4 -> pos_samples, 0 -> neg_samples), with a fixed seed.
sample_size_per_class = 50000
pos_samples = tw_df[tw_df['target'] == 4].sample(sample_size_per_class, random_state=2)
neg_samples = tw_df[tw_df['target'] == 0].sample(sample_size_per_class, random_state=2)
all_samples = pd.concat([pos_samples, neg_samples])

# Preprocess tweets, but only the sampled rows. Sampling does not depend on
# the 'processed_text' column, so the same rows are drawn as when the whole
# of tw_df was preprocessed first; the work on rows that are never used is
# simply skipped.
all_samples['processed_text'] = all_samples['text'].apply(my_preprocessor)

# Train-test split (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(all_samples['processed_text'], all_samples['target'], test_size=0.2, random_state=2)

# Load pre-trained GloVe model
word_embedding_model = api.load("glove-twitter-200")

# Function to compute average GloVe embeddings
def get_embeddings_avg(texts, model):
    """Average the word vectors of every text into one row per text.

    Args:
        texts: Iterable of strings, each tokenized with ``word_tokenize``.
        model: Embedding lookup providing ``key_to_index``, item access by
            word, and ``vector_size``.

    Returns:
        ``np.ndarray`` stacking one averaged vector per text. Texts without
        any in-vocabulary token contribute a zero vector of length
        ``model.vector_size``.
    """
    def _mean_vector(text):
        # Only words present in the model's vocabulary take part in the mean.
        hits = [model[word] for word in word_tokenize(text) if word in model.key_to_index]
        if not hits:
            return np.zeros(model.vector_size)
        return np.mean(hits, axis=0)

    return np.array([_mean_vector(text) for text in texts])

# Turn each split into a matrix of averaged GloVe vectors
X_train_glove, X_test_glove = [
    get_embeddings_avg(split, word_embedding_model) for split in (X_train, X_test)
]

# Train Random Forest Classifier model on the training embeddings
rf_classifier = RandomForestClassifier(random_state=42, n_estimators=100)
rf_classifier.fit(X_train_glove, y_train)

# Predict using Random Forest
y_pred_rf = rf_classifier.predict(X_test_glove)

# Evaluate Model
print("Random Forest Performance:")
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
Random Forest Performance:
Accuracy: 0.7238
