# Import all necessary libraries

In [1]:
import numpy as np
import pandas as pd

# baseline model libraries
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# word-vector + random forest model
import gensim.downloader as api
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier

# SVM model
from sklearn.svm import LinearSVC


# Exploring the Datasets

In [2]:
# Read the train / validation / test splits from the local datasets folder.
train_df_main = pd.read_csv('./datasets/train.csv')
valid_df_main = pd.read_csv('./datasets/valid.csv')
test_df_main = pd.read_csv('./datasets/test.csv')

# Separator line printed between the report sections below.
section_break = "----------"

# (rows, columns) of each split, one per line
print(
    f"Train Shape: {train_df_main.shape}",
    f"Valid Shape: {valid_df_main.shape}",
    f"Test Shape: {test_df_main.shape}",
    sep="\n",
)
print(section_break)

# first rows of the training split
print(train_df_main.head())
print(section_break)

# class balance: number of training rows per label value
train_label_counts = train_df_main['label'].value_counts()
print(train_label_counts)
print(section_break)

# missing values per column of the training split
train_missing_counts = train_df_main.isnull().sum()
print(train_missing_counts)

Train Shape: (21464, 2)
Valid Shape: (716, 2)
Test Shape: (966, 2)
----------
                                                text  label
0  states slow to shut down weak teacher educatio...      0
1    drone places fresh kill on steps of white house      1
2  report: majority of instances of people gettin...      1
3  sole remaining lung filled with rich, satisfyi...      1
4                       the gop's stockholm syndrome      0
----------
label
0    11248
1    10216
Name: count, dtype: int64
----------
text     0
label    0
dtype: int64


# Preprocessing all datasets

Preprocessing lowercases each text and then splits it on whitespace into a list of tokens, so no token is empty or contains whitespace. Punctuation is left attached to the tokens.

In [3]:
def preprocess(text):
    """Lowercase ``text`` and split it on whitespace into a list of tokens.

    ``str.split()`` called without arguments splits on runs of whitespace and
    never yields empty strings, so the returned list has no empty tokens.
    Punctuation stays attached to the token it appears in.
    """
    return text.lower().split()

# raw (unprocessed) text column of each split
X_train, X_valid, X_test = (
    train_df_main['text'],
    valid_df_main['text'],
    test_df_main['text'],
)

# label column of each split
y_train, y_valid, y_test = (
    train_df_main['label'],
    valid_df_main['label'],
    test_df_main['label'],
)

# tokenized text: one token list per row, produced by preprocess()
X_train_processed = [preprocess(raw_text) for raw_text in X_train]
X_valid_processed = [preprocess(raw_text) for raw_text in X_valid]
X_test_processed = [preprocess(raw_text) for raw_text in X_test]

# sanity check: show the first five tokenized training rows
print(X_train_processed[:5])

[['states', 'slow', 'to', 'shut', 'down', 'weak', 'teacher', 'education', 'programs'], ['drone', 'places', 'fresh', 'kill', 'on', 'steps', 'of', 'white', 'house'], ['report:', 'majority', 'of', 'instances', 'of', 'people', 'getting', 'lives', 'back', 'on', 'track', 'occur', 'immediately', 'after', 'visit', 'to', 'buffalo', 'wild', 'wings'], ['sole', 'remaining', 'lung', 'filled', 'with', 'rich,', 'satisfying', 'flavor'], ['the', "gop's", 'stockholm', 'syndrome']]


# Baseline Model

logistic regression + tf-idf

In [32]:
# Baseline: TF-IDF features (unigrams + bigrams, English stop words removed,
# vocabulary capped at 5000 terms) fed into logistic regression.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    stop_words='english',
)

# The vectorizer works on the raw text columns (not the tokenized lists).
# It is fitted on the training split only and reused for the other splits.
X_train_baseline = vectorizer.fit_transform(X_train)
X_valid_baseline = vectorizer.transform(X_valid)
X_test_baseline = vectorizer.transform(X_test)

baseline_model = LogisticRegression(random_state=42, max_iter=10000)
baseline_model.fit(X_train_baseline, y_train)

# Score the fitted baseline on the validation split.
valid_preds = baseline_model.predict(X_valid_baseline)
baseline_valid_accuracy = accuracy_score(y_valid, valid_preds)
baseline_valid_f1 = f1_score(y_valid, valid_preds)

print(f"Baseline Validation Accuracy: {baseline_valid_accuracy:.4f}")
print(f"Baseline Validation F1 Score: {baseline_valid_f1:.4f}")

Baseline Validation Accuracy: 0.7751
Baseline Validation F1 Score: 0.7663


In [4]:
# Search for the TF-IDF vocabulary size (max_features) that gives the best
# validation accuracy for the logistic-regression baseline.
#
# All per-candidate objects use search-specific names so this cell does not
# overwrite the baseline cell's X_train_baseline / X_valid_baseline /
# X_test_baseline / baseline_model / valid_preds. Overwriting them would leave
# the "baseline" names pointing at the last candidate tried, out of sync with
# `vectorizer`, once the notebook is run top to bottom.
best_acc = 0
best_max_features = 4000  # placeholder; replaced once a candidate scores above best_acc
for i in range(1000, 10000, 100):  # candidates 1000, 1100, ..., 9900
    temp_vectorizer = TfidfVectorizer(stop_words='english', max_features=i, ngram_range=(1, 2))

    # Model selection only needs the train and validation splits; the test
    # split is deliberately not transformed here.
    X_train_search = temp_vectorizer.fit_transform(X_train)
    X_valid_search = temp_vectorizer.transform(X_valid)

    search_model = LogisticRegression(max_iter=10000, random_state=42)
    search_model.fit(X_train_search, y_train)

    search_preds = search_model.predict(X_valid_search)

    # Compute each metric once and reuse it for both reporting and selection.
    search_acc = accuracy_score(y_valid, search_preds)
    search_f1 = f1_score(y_valid, search_preds)

    print(f"----- TESTING i = {i} -----")
    print(f"Baseline Validation Accuracy: {search_acc:.4f}")
    print(f"Baseline Validation F1 Score: {search_f1:.4f}")

    # Strict '>' keeps the smallest max_features among tied candidates.
    if search_acc > best_acc:
        best_acc = search_acc
        best_max_features = i

----- TESTING i = 1000 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7170
----- TESTING i = 1100 -----
Baseline Validation Accuracy: 0.7249
Baseline Validation F1 Score: 0.7099
----- TESTING i = 1200 -----
Baseline Validation Accuracy: 0.7207
Baseline Validation F1 Score: 0.7041
----- TESTING i = 1300 -----
Baseline Validation Accuracy: 0.7193
Baseline Validation F1 Score: 0.7022
----- TESTING i = 1400 -----
Baseline Validation Accuracy: 0.7263
Baseline Validation F1 Score: 0.7126
----- TESTING i = 1500 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7128
----- TESTING i = 1600 -----
Baseline Validation Accuracy: 0.7277
Baseline Validation F1 Score: 0.7178
----- TESTING i = 1700 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1800 -----
Baseline Validation Accuracy: 0.7332
Baseline Validation F1 Score: 0.7236
----- TESTING i = 1900 -----
Baseline Validation Accuracy: 0.7388
Baselin

In [5]:
# Best validation accuracy found by the max_features search, followed by the
# max_features value that produced it (one per line).
print(best_acc, best_max_features, sep="\n")

0.7793296089385475
6200


# Possible Model

Random forest on max-pooled pretrained word vectors (GloVe `glove-wiki-gigaword-100`, loaded through gensim)

In [None]:
def sentence_to_vec(texts, model, tokenizer=None):
    """Embed each text as the element-wise maximum of its tokens' word vectors.

    Args:
        texts: Iterable of raw text strings.
        model: Word-vector lookup supporting ``token in model``,
            ``model[token]`` and ``model.vector_size`` (the notebook passes
            the object returned by ``api.load``).
        tokenizer: Optional callable turning one text into a list of tokens.
            When omitted, the notebook's ``preprocess`` function is used.

    Returns:
        A 2-D array with one row per text and ``model.vector_size`` columns.
        A text with no token present in ``model`` gets an all-zero row.
    """
    if tokenizer is None:
        # The previous body called `wv_preprocess`, which is not defined in the
        # visible notebook and raised NameError on a fresh kernel. `preprocess`
        # is the tokenizer this notebook defines.
        # NOTE(review): confirm it matches what `wv_preprocess` used to do.
        tokenizer = preprocess

    vectors = []

    for text in texts:
        # Keep only the vectors of tokens the model knows about.
        known_vectors = [model[token] for token in tokenizer(text) if token in model]

        if known_vectors:
            # Max-pool across tokens, independently for each dimension.
            vectors.append(np.max(known_vectors, axis=0))
        else:
            vectors.append(np.zeros(model.vector_size))

    # The reshape is a no-op for non-empty input and yields shape
    # (0, vector_size) instead of (0,) when `texts` is empty.
    return np.array(vectors).reshape(-1, model.vector_size)

# Random forest trained on pooled word vectors.
#
# The splits come from the dataframes loaded at the top of the notebook
# (train_df_main / valid_df_main). The names `train_df` / `valid_df` used here
# previously are never defined and raised NameError on a fresh kernel.
X_train = train_df_main['text']
X_valid = valid_df_main['text']
y_train = train_df_main['label']
y_valid = valid_df_main['label']

# Pretrained GloVe word vectors, loaded through gensim's downloader API.
model = api.load('glove-wiki-gigaword-100')

# One pooled vector per training text.
X_train_rf = sentence_to_vec(X_train, model)

rf_model = RandomForestClassifier(
    n_estimators=100,      # Number of trees
    max_depth=20,          # Limit depth to prevent overfitting
    min_samples_split=5,   # Need at least 5 samples to split
    random_state=42,
    n_jobs=-1,             # Use all CPU cores
    verbose=1
)

rf_model.fit(X_train_rf, y_train)
print('Training done!')

# Embed the validation texts the same way, then predict.
X_valid_rf = sentence_to_vec(X_valid, model)
y_val_pred = rf_model.predict(X_valid_rf)

# Calculate metrics
accuracy = accuracy_score(y_valid, y_val_pred)
f1 = f1_score(y_valid, y_val_pred)

print("\nValidation Set Results:")
print(f"Accuracy: {accuracy:.4f}")
print(f"F1-Score: {f1:.4f}")




[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.3s


Training done!

Validation Set Results:
Accuracy: 0.7765
F1-Score: 0.7753


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    5.4s finished
[Parallel(n_jobs=8)]: Using backend ThreadingBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


# First Model

SVMs

In [None]:
# Linear SVM on TF-IDF features (unigrams + bigrams, English stop words removed).
# NOTE(review): max_features=5200 is hard-coded and differs from the
# best_max_features value (6200) shown in the saved output of the search cell.
# Confirm which value is intended.
best_vectorizer = TfidfVectorizer(stop_words='english', max_features=5200, ngram_range=(1, 2))

# Fit the vocabulary on the training text only; reuse it for validation.
X_train_svm = best_vectorizer.fit_transform(X_train)
X_valid_svm = best_vectorizer.transform(X_valid)

# LinearSVC is used instead of the kernel-based SVC (the author found it worked
# better here); it is trained on the TF-IDF matrix built above.
svm_model = LinearSVC(random_state=42, max_iter=10000)
svm_model.fit(X_train_svm, y_train)

# Score the fitted SVM on the validation split.
valid_preds_svm = svm_model.predict(X_valid_svm)
svm_valid_accuracy = accuracy_score(y_valid, valid_preds_svm)
svm_valid_f1 = f1_score(y_valid, valid_preds_svm)

print(f"SVM Validation Accuracy: {svm_valid_accuracy:.4f}")
print(f"SVM Validation F1 Score: {svm_valid_f1:.4f}")

SVM Validation Accuracy: 0.7682
SVM Validation F1 Score: 0.7580
