In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Define necessary functions (normalize, sigmoid, gradients, loss, train, predict)

def normalize(X):
    return (X - np.mean(X, axis=0)) / np.std(X, axis=0)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def gradients(X, y, y_hat):
    m = X.shape[0]
    dw = np.dot(X.T, (y_hat - y)) / m
    db = np.sum(y_hat - y) / m
    return dw, db

def loss(y, y_hat):
    m = y.shape[0]
    return -1/m * np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))

def train(X, y, bs, epochs, lr):
    m, n = X.shape
    w = np.zeros((n, 1))
    b = 0
    y = y.reshape(m, 1)
    X = normalize(X)
    losses = []
    for epoch in range(epochs):
        for i in range((m-1) // bs + 1):
            start_i = i * bs
            end_i = start_i + bs
            xb = X[start_i:end_i]
            yb = y[start_i:end_i]
            y_hat = sigmoid(np.dot(xb, w) + b)
            dw, db = gradients(xb, yb, y_hat)
            w -= lr * dw
            b -= lr * db
        l = loss(y, sigmoid(np.dot(X, w) + b))
        losses.append(l)
    return w, b, losses

def predict(X, w, b):
    x = normalize(X)
    preds = sigmoid(np.dot(x, w) + b)
    pred_class = [1 if i >= 0.5 else 0 for i in preds]
    return np.array(pred_class)

def accuracy(y, y_hat):
    accuracy = np.sum(y == y_hat) / len(y)
    return accuracy


In [8]:
df = pd.read_csv('train_tfidf_features.csv')

X = df.drop(['label', 'id'], axis=1)
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=50)

In [9]:
def apply_pca(n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Load the dataset
df = pd.read_csv('./data/train_tfidf_features.csv')

# Separate features and labels
X = df.drop('label', axis=1)  # Replace 'label_column' with the actual label column name
y = df['label']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to apply PCA
def apply_pca(n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

# Apply PCA for different component sizes
components = [2000, 1000, 500, 100]
pca_results = {n: apply_pca(n) for n in components}

# Train KNN and evaluate
knn = KNeighborsClassifier(n_neighbors=2)

def train_and_evaluate(X_train_pca, X_test_pca):
    knn.fit(X_train_pca, y_train)
    y_pred = knn.predict(X_test_pca)
    return y_pred

results = {}
for n, (X_train_pca, X_test_pca) in pca_results.items():
    y_pred = train_and_evaluate(X_train_pca, X_test_pca)
    f1 = f1_score(y_test, y_pred, average='macro')
    results[n] = f1

# Print results
for n, f1 in results.items():
    print(f"PCA Components: {n}, Macro F1 Score: {f1}")

# Save predictions for Kaggle submission (example for 100 components)
y_pred_100 = train_and_evaluate(*pca_results[100])
submission = pd.DataFrame({'Id': X_test.index, 'Prediction': y_pred_100})
submission.to_csv('submission_100_components.csv', index=False)


In [3]:
# Assuming train_df and test_df are your DataFrames
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')

# Prepare training data
X_train = train_df.drop(['label', 'id'], axis=1).values
y_train = train_df['label'].values

# Call the training function
w, b, l = train(X_train, y_train, bs=100, epochs=10, lr=0.01)

# Predictions and accuracy on training data
y_train_pred = predict(X_train, w, b)
print(f"Training Accuracy: {accuracy(y_train, y_train_pred)}")

  return (X - np.mean(X, axis=0)) / np.std(X, axis=0)


Training Accuracy: 0.6187732774674115


In [6]:
# Calculate F1 score
f1 = f1_score(y_train, y_train_pred, average='macro')
print(f"F1 Score: {f1}")

F1 Score: 0.3822482654491858


In [4]:
# Prepare test data
X_test = test_df.drop(['label', 'id'], axis=1).values
y_test = test_df['label'].values

# Predictions and accuracy on test data
y_test_pred = predict(X_test, w, b)
print(f"Test Accuracy: {accuracy(y_test, y_test_pred)}")

# Calculate F1 score
f1 = f1_score(y_test, y_test_pred, average='macro')
print(f"F1 Score: {f1}")

KeyError: "['label'] not found in axis"

In [18]:
!pip install xgboost
!pip install lightgbm


Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m
Defaulting to user installation because normal site-packages is not writeable
Collecting lightgbm
  Downloading lightgbm-4.4.0.tar.gz (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m26.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25l/

In [17]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.decomposition import PCA, KernelPCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
from joblib import Parallel, delayed
import lightgbm as lgb

# Load the dataset
df = pd.read_csv('train_tfidf_features.csv')

# Separate features and labels
X = df.drop('label', axis=1)  # Replace 'label' with the actual label column name
y = df['label']

# Normalize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Function to apply PCA or Kernel PCA
def apply_pca(X, n_components, kernel=None):
    if kernel:
        pca = KernelPCA(n_components=n_components, kernel=kernel)
    else:
        pca = PCA(n_components=n_components)
    X_pca = pca.fit_transform(X)
    return X_pca

# List of PCA components to evaluate
components = [2000, 1000, 500, 100]

# Function to perform grid search for KNN
def grid_search_knn(X_pca, y):
    param_grid = {'n_neighbors': [1, 2, 3, 5, 7, 10]}
    grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='f1_macro', cv=5)
    grid_search.fit(X_pca, y)
    return grid_search.best_estimator_

# Function to evaluate a model with k-fold cross-validation
def evaluate_model_kfold(model, X_pca, y, k=5):
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    f1_scores = []
    for train_index, test_index in kf.split(X_pca):
        X_train, X_test = X_pca[train_index], X_pca[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        f1_scores.append(f1_score(y_test, y_pred, average='macro'))
    return np.mean(f1_scores)

# Function to train and evaluate models
def train_and_evaluate(X, y, n_components, kernel=None, k=5):
    X_pca = apply_pca(X, n_components, kernel)
    
    # KNN
    best_knn = grid_search_knn(X_pca, y)
    knn_f1 = evaluate_model_kfold(best_knn, X_pca, y, k)
    
    # Random Forest
    rf_f1 = evaluate_model_kfold(RandomForestClassifier(), X_pca, y, k)
    
    # SVM
    svm_f1 = evaluate_model_kfold(SVC(), X_pca, y, k)
    
    # XGBoost
    xgb_f1 = evaluate_model_kfold(XGBClassifier(), X_pca, y, k)
    
    # LightGBM
    lgb_f1 = evaluate_model_kfold(lgb.LGBMClassifier(), X_pca, y, k)
    
    return n_components, knn_f1, rf_f1, svm_f1, xgb_f1, lgb_f1

# Parallel computation for all components
results = Parallel(n_jobs=-1)(delayed(train_and_evaluate)(X, y, n) for n in components)

# Print results
for res in results:
    n_components, knn_f1, rf_f1, svm_f1, xgb_f1, lgb_f1 = res
    print(f"PCA Components: {n_components}")
    print(f"KNN Macro F1 Score: {knn_f1}")
    print(f"Random Forest Macro F1 Score: {rf_f1}")
    print(f"SVM Macro F1 Score: {svm_f1}")
    print(f"XGBoost Macro F1 Score: {xgb_f1}")
    print(f"LightGBM Macro F1 Score: {lgb_f1}")

# Save best KNN predictions for Kaggle submission (example for 100 components)
X_pca = apply_pca(X, 100)
best_knn = grid_search_knn(X_pca, y)
y_pred_100 = cross_val_predict(best_knn, X_pca, y, cv=KFold(n_splits=5, shuffle=True, random_state=42))
submission = pd.DataFrame({'Id': np.arange(len(y)), 'Prediction': y_pred_100})
submission.to_csv('submission_100_components.csv', index=False)


ModuleNotFoundError: No module named 'lightgbm'

In [1]:
!pip install lightgbm

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.1[0m[39;49m -> [0m[32;49m24.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [3]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, classification_report
from scipy.stats import uniform, randint

# Load the dataset
df = pd.read_csv('train_tfidf_features.csv')

# Prepare the data
df_feature = df.drop(['label', 'id'], axis=1)
df_target = df['label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df_feature, df_target, test_size=0.2, random_state=42, stratify=df_target)

# Define the refined parameter distribution for RandomizedSearchCV
param_dist = {
    'n_estimators': randint(480, 520),  # Narrowed range around 500
    'learning_rate': uniform(0.08, 0.12),  # Narrowed range around 0.1
    'num_leaves': randint(85, 95),  # Narrowed range around 91
    'max_depth': randint(6, 11),  # Keeping the same range
    'subsample': uniform(0.9, 0.1),  # Keeping the same range
    'colsample_bytree': uniform(0.95, 0.05)  # Keeping the same range
}

# Initialize the LightGBM model with the best parameters as defaults
lgbm = LGBMClassifier(objective='binary', random_state=42, n_estimators=500, learning_rate=0.1, num_leaves=91)

# Use RandomizedSearchCV to find the best hyperparameters within the refined range
random_search = RandomizedSearchCV(estimator=lgbm,
                                   param_distributions=param_dist,
                                   scoring='f1_macro',
                                   n_iter=30,
                                   cv=5,
                                   verbose=2,
                                   n_jobs=-1,
                                   random_state=42)

# Fit the model
random_search.fit(X_train, y_train)

# Print the best parameters found by RandomizedSearchCV
print(f'Best parameters found: {random_search.best_params_}')

# Use the best estimator to make predictions
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Calculate and print the Macro F1 Score
macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f'Macro F1 Score after RandomizedSearchCV: {macro_f1}')

# Print the classification report
print(classification_report(y_test, y_pred, target_names=['Non-Hateful', 'Hateful']))

Fitting 5 folds for each of 30 candidates, totalling 150 fits
[LightGBM] [Info] Number of positive: 4193, number of negative: 6804[LightGBM] [Info] Number of positive: 4193, number of negative: 6805

[LightGBM] [Info] Number of positive: 4192, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6804
[LightGBM] [Info] Number of positive: 4192, number of negative: 6805
[LightGBM] [Info] Number of positive: 4193, number of negative: 6805
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027577 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18883
[LightGBM] [Info] Number of data points in the train set: 10997, number of used features: 805
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.381195 -> initscore=-0.484480
[LightGBM] [Inf