In [2]:
!pip install pandas

Collecting pandas
  Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Using cached numpy-2.0.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Using cached pandas-2.2.2-cp312-cp312-win_amd64.whl (11.5 MB)
Using cached numpy-2.0.0-cp312-cp312-win_amd64.whl (16.2 MB)
Using cached pytz-2024.1-py2.py3-none-any.whl (505 kB)
Using cached tzdata-2024.1-py2.py3-none-any.whl (345 kB)
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully installed numpy-2.0.0 pandas-2.2.2 pytz-2024.1 tzdata-2024.1


In [4]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.14.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.5.0-cp312-cp312-win_amd64.whl (10.9 MB)
Using cached joblib-1.4.2-py3-none-any.whl (301 kB)
Using cached scipy-1.14.0-cp312-cp312-win_amd64.whl (44.5 MB)
Using cached threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.5.0 scipy-1.14.0 threadpoolctl-3.5.0


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Load the dataset
df = pd.read_csv('./data/train_tfidf_features.csv')

# Separate features and labels
X = df.drop('label', axis=1)  # Replace 'label_column' with the actual label column name
y = df['label']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Function to apply PCA
def apply_pca(n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

# Apply PCA for different component sizes
components = [2000, 1000, 500, 100]
pca_results = {n: apply_pca(n) for n in components}

# Train KNN and evaluate
knn = KNeighborsClassifier(n_neighbors=2)

def train_and_evaluate(X_train_pca, X_test_pca):
    knn.fit(X_train_pca, y_train)
    y_pred = knn.predict(X_test_pca)
    return y_pred

results = {}
for n, (X_train_pca, X_test_pca) in pca_results.items():
    y_pred = train_and_evaluate(X_train_pca, X_test_pca)
    f1 = f1_score(y_test, y_pred, average='macro')
    results[n] = f1

# Print results
for n, f1 in results.items():
    print(f"PCA Components: {n}, Macro F1 Score: {f1}")

# Save predictions for Kaggle submission (example for 100 components)
y_pred_100 = train_and_evaluate(*pca_results[100])
submission = pd.DataFrame({'Id': X_test.index, 'Prediction': y_pred_100})
submission.to_csv('submission_100_components.csv', index=False)


PCA Components: 2000, Macro F1 Score: 0.47577897934068236
PCA Components: 1000, Macro F1 Score: 0.47165413769915304
PCA Components: 500, Macro F1 Score: 0.473529103715535
PCA Components: 100, Macro F1 Score: 0.476917766582954


In [10]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.1.0-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.0-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   ---------------------------------------- 0.7/124.9 MB 20.8 MB/s eta 0:00:06
    --------------------------------------- 1.8/124.9 MB 23.2 MB/s eta 0:00:06
    --------------------------------------- 3.1/124.9 MB 24.6 MB/s eta 0:00:05
   - -------------------------------------- 4.2/124.9 MB 24.6 MB/s eta 0:00:05
   - -------------------------------------- 5.6/124.9 MB 25.5 MB/s eta 0:00:05
   -- ------------------------------------- 6.7/124.9 MB 25.4 MB/s eta 0:00:05
   -- ------------------------------------- 8.0/124.9 MB 25.5 MB/s eta 0:00:05
   -- ------------------------------------- 9.1/124.9 MB 25.4 MB/s eta 0:00:05
   --- ------------------------------------ 9.6/124.9 MB 25.6 MB/s eta 0:00:05
   --- ------------------------------------ 9.7/124.9 MB 22.3 MB/s eta

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score
from xgboost import XGBClassifier

# Load the dataset
df = pd.read_csv('./data/train_tfidf_features.csv')

# Separate features and labels
X = df.drop('label', axis=1)  # Replace 'label' with the actual label column name
y = df['label']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Function to apply PCA
def apply_pca(n_components):
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train)
    X_test_pca = pca.transform(X_test)
    return X_train_pca, X_test_pca

# Apply PCA for different component sizes
components = [2000, 1000, 500, 100]
pca_results = {n: apply_pca(n) for n in components}

# Grid search for KNN hyperparameters
param_grid = {'n_neighbors': [1, 2, 3, 5, 7, 10]}
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, scoring='f1_macro', cv=5)

# Train KNN and evaluate
results = {}
for n, (X_train_pca, X_test_pca) in pca_results.items():
    grid_search.fit(X_train_pca, y_train)
    best_knn = grid_search.best_estimator_
    y_pred = best_knn.predict(X_test_pca)
    f1 = f1_score(y_test, y_pred, average='macro')
    results[n] = f1

# Print KNN results
for n, f1 in results.items():
    print(f"KNN with PCA Components: {n}, Macro F1 Score: {f1}")

# Evaluate with Random Forest, SVM, and XGBoost
def evaluate_model(model, X_train_pca, X_test_pca):
    model.fit(X_train_pca, y_train) 
    y_pred = model.predict(X_test_pca)
    return f1_score(y_test, y_pred, average='macro')

rf_results = {n: evaluate_model(RandomForestClassifier(), X_train_pca, X_test_pca) for n, (X_train_pca, X_test_pca) in pca_results.items()}
svm_results = {n: evaluate_model(SVC(), X_train_pca, X_test_pca) for n, (X_train_pca, X_test_pca) in pca_results.items()}
xgb_results = {n: evaluate_model(XGBClassifier(), X_train_pca, X_test_pca) for n, (X_train_pca, X_test_pca) in pca_results.items()}

# Print results for other models
print("Random Forest Results:")
for n, f1 in rf_results.items():
    print(f"PCA Components: {n}, Macro F1 Score: {f1}")

print("SVM Results:")
for n, f1 in svm_results.items():
    print(f"PCA Components: {n}, Macro F1 Score: {f1}")

print("XGBoost Results:")
for n, f1 in xgb_results.items():
    print(f"PCA Components: {n}, Macro F1 Score: {f1}")

# Save best predictions for Kaggle submission (example for 100 components)
y_pred_100 = best_knn.predict(pca_results[100][1])
submission = pd.DataFrame({'Id': X_test.index, 'Prediction': y_pred_100})
submission.to_csv('submission_100_components.csv', index=False)


KNN with PCA Components: 2000, Macro F1 Score: 0.5218488011865823
KNN with PCA Components: 1000, Macro F1 Score: 0.5176021928458235
KNN with PCA Components: 500, Macro F1 Score: 0.5527413341275035
KNN with PCA Components: 100, Macro F1 Score: 0.593241729056758
Random Forest Results:
PCA Components: 2000, Macro F1 Score: 0.5415740149442299
PCA Components: 1000, Macro F1 Score: 0.5640225365950495
PCA Components: 500, Macro F1 Score: 0.5892665313150522
PCA Components: 100, Macro F1 Score: 0.6091303306873392
SVM Results:
PCA Components: 2000, Macro F1 Score: 0.6402529331404694
PCA Components: 1000, Macro F1 Score: 0.6324128771807379
PCA Components: 500, Macro F1 Score: 0.6302789501203805
PCA Components: 100, Macro F1 Score: 0.6183301523100327
XGBoost Results:
PCA Components: 2000, Macro F1 Score: 0.6217998402477325
PCA Components: 1000, Macro F1 Score: 0.6394975168822356
PCA Components: 500, Macro F1 Score: 0.62840512362262
PCA Components: 100, Macro F1 Score: 0.6311885635259854


AttributeError: 'numpy.ndarray' object has no attribute 'index'