# HYPERPARAMETER TUNING FOR TRADITIONAL ALGORITHIMS OF ML

### IMPORT PACKAGES & FUNCTIONS

In [1]:
import re
import pandas as pd
import sklearn.svm as svm
from nltk import download
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

download('wordnet')
download('omw-1.4')
download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
def preprocessing(text):
    text = text.lower()
    text_cleaned = re.sub(r'[^0-9a-z_+\-*]', ' ', text).strip()
    lemm = WordNetLemmatizer()
    title = []
    for token in text_cleaned.split():
        token_lemm = lemm.lemmatize(token)
        if token_lemm not in stopwords.words('english'):
             title.append(lemm.lemmatize(token))
    return ' '.join(str(elem) for elem in title)

##IMPORT DATA FROM LOCAL FILES
Files to upload:
- test_category.csv
- test_data.csv
- train_category.csv
- train_data.csv

In [3]:
from google.colab import files
uploaded = files.upload()

Saving test_category.csv to test_category.csv
Saving test_data.csv to test_data.csv
Saving train_category.csv to train_category.csv
Saving train_data.csv to train_data.csv


In [4]:
train_data_df = pd.read_csv('/content/train_data.csv')
test_data_df = pd.read_csv('/content/test_data.csv')
train_category_df = pd.read_csv('/content/train_category.csv')
test_category_df = pd.read_csv('/content/test_category.csv')

In [5]:
train_data = train_data_df['headline']
test_data = test_data_df['headline']
train_label = train_category_df['labels'] 
test_label = test_category_df['labels']

## DATA PREPARATION

In [6]:
train_data = train_data.apply(lambda x: preprocessing(str(x)))
test_data = test_data.apply(lambda x: preprocessing(str(x)))

In [7]:
encoder = LabelEncoder()
train_label = encoder.fit_transform(train_label)
test_label = encoder.fit_transform(test_label)

In [8]:
tfidf_vector = TfidfVectorizer()
tfidf_vector.fit(train_data)
train_data_tfidf = tfidf_vector.transform(train_data)
test_data_tfidf = tfidf_vector.transform(test_data)

## SETTING MODELS AND PARAMETERS FOR TUNING
- GridSearchCV: Make a grid search over all combinations with the parameters and the models given

In [9]:
models = {
    "Random Forest": RandomForestClassifier(),
    "MNB": MultinomialNB(),
    "SVM": svm.SVC(),
    "KNN": KNeighborsClassifier()
}
params = {
    "Random Forest": {
        "n_estimators": [10, 100, 500, 1000],
        "max_features": ['auto', 'sqrt', 'log2']
    },
    "MNB": {
        "alpha": [0.0001, 0.001, 0.01, 1]
    },
    "SVM": {
        "gamma": [10, 1, 0.1, 1e-2, 1e-3],
        "C": [0.01, 0.1, 1, 10, 100, 1000]
    },
    "KNN": {
        "n_neighbors": [1, 2, 3, 5, 10, 50]
    }
}


In [10]:
random_cv_model = [
    ("Random Forest", models["Random Forest"], params["Random Forest"]),
    ("MNB", models["MNB"], params["MNB"]),
    ("SVM", models["SVM"], params["SVM"]),
    ("KNN", models["KNN"], params["KNN"])
]

In [11]:
model_param = {}

In [12]:
for name, model, params in random_cv_model:
    cv = GridSearchCV(estimator=model,
                      param_grid=params,
                      verbose=2
                      )
    cv.fit(train_data_tfidf, train_label)
    model_param[name] = cv.best_params_

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .................max_features=auto, n_estimators=10; total time=   5.0s
[CV] END .................max_features=auto, n_estimators=10; total time=   5.9s
[CV] END .................max_features=auto, n_estimators=10; total time=   5.0s
[CV] END .................max_features=auto, n_estimators=10; total time=   7.0s
[CV] END .................max_features=auto, n_estimators=10; total time=   5.4s
[CV] END ................max_features=auto, n_estimators=100; total time=  54.2s
[CV] END ................max_features=auto, n_estimators=100; total time=  54.9s
[CV] END ................max_features=auto, n_estimators=100; total time=  54.1s
[CV] END ................max_features=auto, n_estimators=100; total time=  54.5s
[CV] END ................max_features=auto, n_estimators=100; total time=  53.6s
[CV] END ................max_features=auto, n_estimators=500; total time= 4.4min
[CV] END ................max_features=auto, n_es

In [13]:
for model_name in model_param:
    print(f"------------------- Best Params for {model_name} -----------------")
    print(model_param[model_name])

------------------- Best Params for Random Forest -----------------
{'max_features': 'log2', 'n_estimators': 1000}
------------------- Best Params for MNB -----------------
{'alpha': 1}
------------------- Best Params for SVM -----------------
{'C': 10, 'gamma': 1}
------------------- Best Params for KNN -----------------
{'n_neighbors': 5}
