<a href="https://colab.research.google.com/github/lapnguyen331/machine_leaning_lab/blob/main/Lab_7_21130419_NguyenHoangLap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This lab deals with **GridSearchCV** for tuning the hyper-parameters of an estimator and applying vectorization techniques to the **movie reviews dataset** for classification task.

*   **Deadline: 23:59, 22/4/2024 (lớp TH thứ 3) || 29/4/2024 (lớp TH thứ 5)**



# Import libraries

In [None]:
# code
import pandas as pd
import numpy as np
from sklearn import datasets

from sklearn.model_selection import train_test_split

from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from prettytable import PrettyTable
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier





# Task 1. With **iris** dataset
*  1.1. Apply **GridSearchCV** for **SVM** to find the best hyperparameters using the following param_grid.

```
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf','linear']}
```




In [None]:
def get_metrics(AlgoName,model, X_train, y_train, X_test, y_test, average='macro'):
    """Fit ``model`` on the training data and score it on the test data.

    Args:
        AlgoName: Label placed in the first position of the returned row.
        model: Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
            in place by this call.
        X_train, y_train: Data passed to ``model.fit``.
        X_test, y_test: Data used for prediction and scoring.
        average: Averaging mode forwarded to the precision, recall and F1
            scorers.

    Returns:
        ``[AlgoName, accuracy, precision, recall, f1]``.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    row = [AlgoName, metrics.accuracy_score(y_test, predictions)]
    # The three remaining scorers share the same call signature.
    for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score):
        row.append(scorer(y_test, predictions, average=average))
    return row

In [None]:


# Search space reused by every SVC grid search in this notebook.
param_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001, 0.0001],
    kernel=['rbf', 'linear'],
)

In [None]:
# Task 1.1: tune an SVC on the iris data with an exhaustive grid search.
iris = datasets.load_iris()
y1 = iris.target
x1 = iris.data
# A fixed seed keeps the 70/30 split -- and therefore the best parameters
# printed below -- identical on every Restart & Run All.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    x1, y1, test_size=0.3, random_state=42)

print('sử dụng grid search')
# refit=True leaves the search object holding the winning estimator, ready to
# predict.
grid = GridSearchCV(SVC(), param_grid=param_grid, refit=True)
grid.fit(X1_train, y1_train)
print("best-param", grid.best_params_)
print('best-estiamte', grid.best_estimator_)



sử dụng grid search
best-param {'C': 1, 'gamma': 1, 'kernel': 'linear'}
best-estiamte SVC(C=1, gamma=1, kernel='linear')


*  1.2. Apply **GridSearchCV** for **kNN** to find the best hyperparameters using the following param_grid.

```
grid_params = { 'n_neighbors' : [5,7,9,11,13,15],
               'weights' : ['uniform','distance'],
               'metric' : ['minkowski','euclidean','manhattan']}
```
where

    *  **n_neighbors**: Decide the best k based on the values we have computed earlier.
    *  **weights**: Check whether adding weights to the data points is beneficial to the model or not. 'uniform' assigns no weight, while 'distance' weighs points by the inverse of their distances meaning nearer points will have more weight than the farther points.
    *  **metric**: The distance metric to be used when calculating the similarity.


In [None]:
# Task 1.2: exhaustive search over k, the neighbour weighting scheme and the
# distance metric for kNN on the iris training split.
grid_params = dict(
    n_neighbors=[5, 7, 9, 11, 13, 15],
    weights=['uniform', 'distance'],
    metric=['minkowski', 'euclidean', 'manhattan'],
)
grid1 = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid_params, refit=True)
grid1.fit(X1_train, y1_train)
print("best-param", grid1.best_params_)
print('best-estiamte', grid1.best_estimator_)


best-param {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}
best-estiamte KNeighborsClassifier(weights='distance')


*  1.3. Apply **GridSearchCV** for **Random Forest** to find the best hyperparameters using the following param_grid.

```
param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
```

In [None]:
# Task 1.3: grid search for a random forest on the iris training split.
random_param_grid = {
    'n_estimators': [25, 50, 100, 150],
    'max_features': ['sqrt', 'log2', None],
    'max_depth': [3, 6, 9],
    'max_leaf_nodes': [3, 6, 9],
}
# Random forests are stochastic; a fixed random_state makes the selected
# hyperparameters repeatable across runs.
grid2 = GridSearchCV(RandomForestClassifier(random_state=42),
                     param_grid=random_param_grid, refit=True)
grid2.fit(X1_train, y1_train)

print("best-param", grid2.best_params_)
print('best-estiamte', grid2.best_estimator_)

best-param {'max_depth': 3, 'max_features': 'sqrt', 'max_leaf_nodes': 3, 'n_estimators': 100}
best-estiamte RandomForestClassifier(max_depth=3, max_leaf_nodes=3)


*   1.4 Compare the best obtained results from 1.1 to 1.3 (use PrettyTable to display the results)

In [None]:
# Task 1.4: score the tuned models on the held-out iris test split.
# Each search's winning estimator is passed rather than the GridSearchCV object
# itself: get_metrics() calls fit(), and fitting a GridSearchCV would run the
# whole search again (and, for the random forest, could pick a different model
# than the one printed above).
table1 = PrettyTable(['Algo', 'accuracy', 'precision', 'recall', 'f1'])
table1.add_row(get_metrics("SVM", grid.best_estimator_, X1_train, y1_train, X1_test, y1_test))
table1.add_row(get_metrics("KNN ", grid1.best_estimator_, X1_train, y1_train, X1_test, y1_test))
table1.add_row(get_metrics("Random Forest", grid2.best_estimator_, X1_train, y1_train, X1_test, y1_test))
print(table1)

+---------------+--------------------+--------------------+--------------------+--------------------+
|      Algo     |      accuracy      |     precision      |       recall       |         f1         |
+---------------+--------------------+--------------------+--------------------+--------------------+
|      SVM      | 0.9777777777777777 | 0.9803921568627452 | 0.9791666666666666 | 0.9791463017269469 |
|      KNN      | 0.9333333333333333 | 0.9385620915032679 |       0.9375       | 0.9374389051808407 |
| Random Forest | 0.9555555555555556 | 0.9629629629629629 | 0.9583333333333334 | 0.9581699346405229 |
+---------------+--------------------+--------------------+--------------------+--------------------+


# Task 2.
For breast cancer dataset (https://tinyurl.com/3vme8hr3) which could be loaded from datasets in sklearn as follows:

```
#Import scikit-learn dataset library
from sklearn import datasets

#Load dataset
cancer = datasets.load_breast_cancer()
```

*   Apply **GridSearchCV** to different classification algorithms such as **SVM, kNN, LogisticRegression, RandomForest**.
*   Compare the results obtained by the best hyperparameters among classification algorithms.

*   2.1. Apply **GridSearchCV** to **SVM**


In [None]:
# Import scikit-learn dataset library (also imported in the first cell; repeated
# here as given in the task statement)
from sklearn import datasets

# Load the breast cancer dataset; `cancer.data` and `cancer.target` are read
# in the next cell
cancer = datasets.load_breast_cancer()

In [None]:
# Feature matrix and class labels of the breast-cancer data.
x2, y2 = cancer.data, cancer.target

In [None]:
# Hold out 30% for testing. The fixed seed keeps the split stable, so the best
# parameters and metrics reported in the following cells can be reproduced.
X2_train, X2_test, y2_train, y2_test = train_test_split(
    x2, y2, test_size=0.3, random_state=42)

In [None]:
# Task 2.1: 10-fold grid search for an SVC on the breast-cancer training split.
grid2_1 = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=10, refit=True)
grid2_1.fit(X2_train, y2_train)
print("best-param", grid2_1.best_params_)
print('best-estiamte', grid2_1.best_estimator_)
# Test-set predictions from the search's best estimator.
grid2_1predictions = grid2_1.predict(X2_test)

best-param {'C': 100, 'gamma': 1, 'kernel': 'linear'}
best-estiamte SVC(C=100, gamma=1, kernel='linear')


*   2.2. Apply **GridSearchCV** to **kNN**

In [None]:
# Task 2.2: 10-fold grid search for kNN on the breast-cancer training split.
grid2_2 = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=grid_params, cv=10, refit=True)
grid2_2.fit(X2_train, y2_train)
print("best-param", grid2_2.best_params_)
print('best-estiamte', grid2_2.best_estimator_)
# Test-set predictions from the search's best estimator.
grid2_2predictions = grid2_2.predict(X2_test)

best-param {'metric': 'manhattan', 'n_neighbors': 7, 'weights': 'uniform'}
best-estiamte KNeighborsClassifier(metric='manhattan', n_neighbors=7)


*   2.3. Apply **GridSearchCV** to **LogisticRegression**

In [None]:
# Search space for logistic regression.
logreg_param_grid_new = {
    # Inverse regularisation strength (smaller C = stronger regularisation).
    # The value 1 used to be listed twice, which made the search fit every
    # C=1 candidate a second time without adding anything to the search space.
    'C': [0.1, 1, 10],
    'penalty': ['l1', 'l2'],  # both L1 and L2 penalties are tried
    'solver': ['liblinear', 'saga'],  # solvers paired with either penalty above
    'max_iter': [100, 200, 300]  # iteration budgets to try
}

In [None]:
# Task 2.3: 10-fold grid search for logistic regression on the breast-cancer
# training split.
grid2_3 = GridSearchCV(estimator=LogisticRegression(), param_grid=logreg_param_grid_new, cv=10, refit=True)
grid2_3.fit(X2_train, y2_train)
print("best-param", grid2_3.best_params_)
print('best-estiamte', grid2_3.best_estimator_)
# Test-set predictions from the search's best estimator.
grid2_3predictions = grid2_3.predict(X2_test)



best-param {'C': 10, 'max_iter': 100, 'penalty': 'l1', 'solver': 'liblinear'}
best-estiamte LogisticRegression(C=10, penalty='l1', solver='liblinear')


*   2.4. Apply **GridSearchCV** to **RandomForest**

In [None]:
# Random-forest search space reused by Tasks 2, 3 and 4.
rf_param_grid = dict(
    n_estimators=[100, 200, 300],    # number of trees in the forest
    max_depth=[None, 10, 20],        # maximum depth of each tree
    min_samples_split=[2, 5, 10],    # minimum samples needed to split a node
    min_samples_leaf=[1, 2, 4],      # minimum samples required in a leaf
    bootstrap=[True, False],         # whether bootstrap sampling is used
)

In [None]:
# Task 2.4: 10-fold grid search for a random forest on the breast-cancer
# training split. The fixed random_state makes the forest, and so the selected
# hyperparameters and the report below, repeatable across runs.
grid2_4 = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=10, refit=True)
grid2_4.fit(X2_train, y2_train)
print("best-param", grid2_4.best_params_)
print('best-estiamte', grid2_4.best_estimator_)
# Evaluate the search's best estimator on the held-out test split.
grid2_4predictions = grid2_4.predict(X2_test)
print(classification_report(y2_test, grid2_4predictions))

best-param {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 200}
best-estiamte RandomForestClassifier(min_samples_split=5, n_estimators=200)
              precision    recall  f1-score   support

           0       0.97      0.90      0.93        69
           1       0.93      0.98      0.96       102

    accuracy                           0.95       171
   macro avg       0.95      0.94      0.94       171
weighted avg       0.95      0.95      0.95       171



*   2.5. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
# Task 2.5: score the tuned models on the held-out breast-cancer test split.
# Every search above ran with refit=True, so best_estimator_ is already trained
# on the training split with the winning hyperparameters. Reusing it removes
# the hand-copied settings, which had drifted from the search results (logistic
# regression was rebuilt with max_iter=1000 although the search printed
# max_iter=100).
best_svm = grid2_1.best_estimator_
best_knn = grid2_2.best_estimator_
best_lo = grid2_3.best_estimator_
best_ran = grid2_4.best_estimator_

algo = [best_svm, best_knn, best_lo, best_ran]
result = []
for en in algo:
  temp_pred = en.predict(X2_test)
  accuracy = metrics.accuracy_score(y2_test, temp_pred)
  # Binary task: the scorers are called with their default settings.
  precision = metrics.precision_score(y2_test, temp_pred)
  recall = metrics.recall_score(y2_test, temp_pred)
  f1 = metrics.f1_score(y2_test, temp_pred)
  # One row per model, in the order [accuracy, precision, recall, f1].
  result.append([accuracy, precision, recall, f1])



In [None]:
# Sanity check: every metric row, then the accuracy of the first model.
print(result, result[0][0], sep='\n')

[[0.9766081871345029, 0.9622641509433962, 1.0, 0.9807692307692307], [0.9532163742690059, 0.9272727272727272, 1.0, 0.9622641509433962], [0.9707602339181286, 0.9532710280373832, 1.0, 0.9760765550239235], [0.9473684210526315, 0.9345794392523364, 0.9803921568627451, 0.9569377990430622]]
0.9766081871345029


In [None]:
# Task 2.5 summary: one table row per model, in the order the models were scored.
table2_5 = PrettyTable(['Algo','accuracy', 'precision', 'recall', 'f1'])
for position, algo_name in enumerate(["svm", "Knn", "logistic", "Random forest"]):
    scores = result[position]
    table2_5.add_row([algo_name, scores[0], scores[1], scores[2], scores[3]])

print(table2_5)

+---------------+--------------------+--------------------+--------------------+--------------------+
|      Algo     |      accuracy      |     precision      |       recall       |         f1         |
+---------------+--------------------+--------------------+--------------------+--------------------+
|      svm      | 0.9766081871345029 | 0.9622641509433962 |        1.0         | 0.9807692307692307 |
|      Knn      | 0.9532163742690059 | 0.9272727272727272 |        1.0         | 0.9622641509433962 |
|    logistic   | 0.9707602339181286 | 0.9532710280373832 |        1.0         | 0.9760765550239235 |
| Random forest | 0.9473684210526315 | 0.9345794392523364 | 0.9803921568627451 | 0.9569377990430622 |
+---------------+--------------------+--------------------+--------------------+--------------------+


# Task 3. With **mobile price classification** dataset
* 3.1.  Apply **GridSearchCV** for **SVM, kNN, RandomForest** algorithms to find the best hyperparameters for each classification algorithm.
* 3.2. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
#load dataset mobile
from google.colab import drive
drive.mount('/content/gdrive')
%cd '/content/gdrive/MyDrive/machine_learning/lab5'
mobile_train= pd.read_csv('mobile_train.csv')
mobile_test = pd.read_csv('mobile_test.csv')
X_train3=mobile_train.iloc[:,:-1]
y_train3=mobile_train.iloc[:,-1]


X3_train, X3_test,y3_train, y3_test = train_test_split(X_train3,y_train3,test_size= 0.3)


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
/content/gdrive/MyDrive/machine_learning/lab5


In [None]:
# Task 3.1 (SVM): 10-fold grid search on the mobile price training split.
# n_jobs=-1 spreads the candidate fits over all available cores; the recorded
# run of this cell ended in a KeyboardInterrupt.
grid3_1 = GridSearchCV(SVC(), param_grid=param_grid, cv=10, refit=True, n_jobs=-1)
grid3_1.fit(X3_train, y3_train)
print("best-param", grid3_1.best_params_)
print('best-estiamte', grid3_1.best_estimator_)
# Stored under a Task 3 name so the Task 2 predictions (grid2_1predictions)
# are no longer overwritten.
grid3_1predictions = grid3_1.predict(X3_test)

KeyboardInterrupt: 

In [None]:
# Task 3.1 (kNN): 10-fold grid search on the mobile price training split.
grid3_2 = GridSearchCV(KNeighborsClassifier(), grid_params, cv=10, refit=True)
grid3_2.fit(X3_train, y3_train)
print("best-param", grid3_2.best_params_)
print('best-estiamte', grid3_2.best_estimator_)
# Stored under a Task 3 name so the Task 2 predictions (grid2_2predictions)
# are no longer overwritten.
grid3_2predictions = grid3_2.predict(X3_test)

best-param {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}
best-estiamte KNeighborsClassifier(weights='distance')


In [None]:
# Task 3.1 (random forest): 10-fold grid search on the mobile price training
# split. The fixed random_state makes the forest, and so the selected
# hyperparameters and the report below, repeatable across runs.
grid3_4 = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=10, refit=True)
grid3_4.fit(X3_train, y3_train)
print("best-param", grid3_4.best_params_)
print('best-estiamte', grid3_4.best_estimator_)
# Evaluate the search's best estimator on the held-out split.
grid3_4predictions = grid3_4.predict(X3_test)
print(classification_report(y3_test, grid3_4predictions))

best-param {'bootstrap': False, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 300}
best-estiamte RandomForestClassifier(bootstrap=False, max_depth=10, min_samples_leaf=2,
                       min_samples_split=5, n_estimators=300)
              precision    recall  f1-score   support

           0       0.76      0.85      0.80        91
           1       0.98      0.93      0.96        92
           2       0.63      0.87      0.73        91
           3       0.86      0.86      0.86       105
           4       0.72      0.61      0.66        99
           5       0.91      0.87      0.89       105
           6       0.61      0.45      0.52        99
           7       0.88      0.86      0.87        94
           8       0.97      0.97      0.97       115
           9       0.88      0.94      0.91       109

    accuracy                           0.82      1000
   macro avg       0.82      0.82      0.82      1000
weighted avg       0.82      

In [None]:
# Task 3.2: refit the selected configurations and score them on the held-out
# mobile price split.
# NOTE(review): the SVM grid search above did not finish in the recorded run
# (KeyboardInterrupt), so these SVM settings do not come from a completed
# search -- confirm or re-run.
best3_svm = SVC(C=0.1, gamma=1, kernel='linear')
best3_knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5, weights='distance')
# Task 3.1 also tunes a random forest, so it is part of the comparison; its
# settings are taken directly from the finished search.
best3_rf = RandomForestClassifier(**grid3_4.best_params_)

algo_3 = [best3_svm, best3_knn, best3_rf]
result3 = []
for en1 in algo_3:
  en1.fit(X3_train, y3_train)
  temp3_pred = en1.predict(X3_test)
  accuracy = metrics.accuracy_score(y3_test, temp3_pred)
  # Macro averaging (as in get_metrics): on single-label multiclass data the
  # micro-averaged precision, recall and F1 all equal accuracy, which made the
  # three columns redundant.
  precision = metrics.precision_score(y3_test, temp3_pred, average='macro')
  recall = metrics.recall_score(y3_test, temp3_pred, average='macro')
  f1 = metrics.f1_score(y3_test, temp3_pred, average='macro')
  # One row per model, in the order [accuracy, precision, recall, f1].
  result3.append([accuracy, precision, recall, f1])


In [None]:
# Task 3.2 summary table.
# Rows are read from result3 (the Task 3 scores). The table used to read
# `result`, which holds the Task 2 breast-cancer scores, so it repeated the
# Task 2 numbers.
table3_5 = PrettyTable(['Algo', 'accuracy', 'precision', 'recall', 'f1'])
# zip() stops at the shorter sequence, so only models that were actually
# scored get a row.
for algo_name, scores in zip(["svm", "Knn", "Random forest"], result3):
    table3_5.add_row([algo_name, *scores])

print(table3_5)

+------+--------------------+--------------------+--------+--------------------+
| Algo |      accuracy      |     precision      | recall |         f1         |
+------+--------------------+--------------------+--------+--------------------+
| svm  | 0.9766081871345029 | 0.9622641509433962 |  1.0   | 0.9807692307692307 |
| Knn  | 0.9532163742690059 | 0.9272727272727272 |  1.0   | 0.9622641509433962 |
+------+--------------------+--------------------+--------+--------------------+


# Task 4.
The dataset consists of **2000 user-created movie reviews** archived on the IMDb(Internet Movie Database). The reviews are equally partitioned into a positive set and a negative set (1000+1000). Each review consists of a plain text file (.txt) and a class label representing the overall user opinion.
The class attribute has only two values: **pos** (positive) or **neg** (negative).


*   4.1 Importing additional libraries

In [None]:
import nltk, random
nltk.download('movie_reviews')#download movie reviews dataset
from nltk.corpus import movie_reviews
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import cross_val_score
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


*   4.2. Movie reviews information

In [None]:
# Corpus overview: number of files, class labels, leading tokens, first file ids.
file_ids = movie_reviews.fileids()
print(len(file_ids))
print(movie_reviews.categories())
print(movie_reviews.words()[:100])
print(file_ids[:10])

2000
['neg', 'pos']
['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...]
['neg/cv000_29416.txt', 'neg/cv001_19502.txt', 'neg/cv002_17424.txt', 'neg/cv003_12683.txt', 'neg/cv004_12641.txt', 'neg/cv005_29357.txt', 'neg/cv006_17022.txt', 'neg/cv007_4992.txt', 'neg/cv008_29326.txt', 'neg/cv009_29417.txt']


*   4.3. Create dataset from movie reviews

In [None]:
# Build (token list, label) pairs, one per review file, category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
# Seeded shuffle so the document order is the same on every run.
random.seed(123)
random.shuffle(documents)

In [None]:
# Corpus size summary plus a peek at the first document.
total_words = sum(len(words) for words, _ in documents)
first_doc_words = documents[0][0]
print('Number of Reviews/Documents: {}'.format(len(documents)))
print('Corpus Size (words): {}'.format(total_words))
print('Sample Text of Doc 1:')
print('-'*30)
print(' '.join(first_doc_words[:50]))  # first 50 words of the first document

Number of Reviews/Documents: 2000
Corpus Size (words): 1583820
Sample Text of Doc 1:
------------------------------
most movies seem to release a third movie just so it can be called a trilogy . rocky iii seems to kind of fit in that category , but manages to be slightly unique . the rocky formula of " rocky loses fight / rocky trains / rocky wins fight


In [None]:
# Number of reviews per sentiment label over the whole corpus.
sentiment_distr = Counter(pair[1] for pair in documents)
print(sentiment_distr)

Counter({'pos': 1000, 'neg': 1000})


*   4.4. Train test split

In [None]:
# 67/33 split of the shuffled documents; the fixed seed keeps it repeatable.
train, test = train_test_split(documents, random_state=42, test_size=0.33)

In [None]:
# Sentiment distribution of the training partition, then the test partition
for partition in (train, test):
    print(Counter(pair[1] for pair in partition))

Counter({'neg': 674, 'pos': 666})
Counter({'pos': 334, 'neg': 326})


In [None]:
# Each document is a (token list, label) pair: join the tokens back into one
# space-separated string for the vectoriser and keep the labels alongside.
X_train = [' '.join(pair[0]) for pair in train]
y_train = [pair[1] for pair in train]
X_test = [' '.join(pair[0]) for pair in test]
y_test = [pair[1] for pair in test]

*   4.5. Text Vectorization

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# TF-IDF over alphabetic tokens; min_df=10 drops terms that occur in fewer
# than 10 training documents.
tfidf_vec = TfidfVectorizer(token_pattern=r'[a-zA-Z]+', min_df=10)
tfidf_vec.fit(X_train)  # the vectoriser is fitted on the training text only
X_train_bow = tfidf_vec.transform(X_train)
X_test_bow = tfidf_vec.transform(X_test)

# Feature selection: keep the 1000 columns with the best chi-squared score
# against the training labels, then apply the same selection to the test matrix.
selector = SelectKBest(score_func=chi2, k=1000)
selector.fit(X_train_bow, y_train)
new_X_train_bow = selector.transform(X_train_bow)
new_X_test_bow = selector.transform(X_test_bow)


*   4.6. Apply **SVM** with **GridSearchCV**

In [None]:
# Task 4.6: 10-fold grid search for an SVC on the selected TF-IDF features.
grid4_1 = GridSearchCV(estimator=SVC(), param_grid=param_grid, cv=10, refit=True)
grid4_1.fit(new_X_train_bow, y_train)
print("best-param", grid4_1.best_params_)
print('best-estiamte', grid4_1.best_estimator_)


best-param {'C': 10, 'gamma': 1, 'kernel': 'linear'}
best-estiamte SVC(C=10, gamma=1, kernel='linear')


ValueError: could not convert string to float: 'susan granger \' s review of " hearts in atlantis " ( castle rock / warner bros . ) timing is everything , and the timing just seems right for this poignant psychological drama that combines the coming - of - age nostalgia of " stand by me " with the mystical power of " the green mile . " based on stephen king stories adapted by screenwriter william goldman and directed by scott hicks , it \' s set in 1960 in harwich , connecticut , where fatherless 11 year - old bobby garfield ( anton yelchin ) lives in a boarding house with his resentful , self - centered mother ( hope davis ) . he \' s devoted to his neighborhood friends ( mika boorem , will rothhaar ) but his pivotal relationship is with a strange , new tenant , ted brautigan ( anthony hopkins ) who opens the world of literature to him after his selfish mother refuses to buy him a birthday gift and hands him , instead , a library card . knowing he \' s longing for a schwinn bike , ted offers to pay him $ 1 a week to read him the local newspaper and keep his eyes peeled for signs of the malevolent , ominous low men who are chasing him to exploit his special powers . the story structure consists of one long flashback , framed by the present , featuring the adult bobby ( david morse ) , minimizing the supernatural elements while emphasizing the human drama hopkins \' mysterious character is genteel , sensitive and benignly seductive . the only explanation is that he \' s a psychic who is wanted by the fbi to aid in their hunt for communists and even that seems appropriate , given the inexplicable state of the world right now . the production values and performances are solid , particularly hopkins - arguably the finest , most versatile actor of our era - and the children with whom he forges a firm bond . on the granger movie gauge of 1 to 10 , " hearts in atlantis " is a wistful , enigmatic 8 , evolving with subtle power to an emotionally effective catharsis .'

*   4.7. Apply **RandomForest** with **GridSearchCV**

In [None]:
# Task 4.7: 10-fold grid search for a random forest on the selected TF-IDF
# features.
grid4_4 = GridSearchCV(RandomForestClassifier(), rf_param_grid, cv=10, refit=True)
grid4_4.fit(new_X_train_bow, y_train)
print("best-param", grid4_4.best_params_)
print('best-estiamte', grid4_4.best_estimator_)
# The model was fitted on the SelectKBest-reduced matrix (1000 columns), so it
# must predict on the test matrix reduced by the same selector. Passing the
# full TF-IDF matrix X_test_bow raised
# "X has 6138 features, but RandomForestClassifier is expecting 1000 features".
grid4_4predictions = grid4_4.predict(new_X_test_bow)
print(classification_report(y_test, grid4_4predictions))

best-param {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
best-estiamte RandomForestClassifier(max_depth=20, n_estimators=300)


ValueError: X has 6138 features, but RandomForestClassifier is expecting 1000 features as input.

*   4.8. Apply **kNN** with **GridSearchCV**

In [None]:
# Task 4.8: 10-fold grid search for kNN on the selected TF-IDF features.
grid4_2 = GridSearchCV(KNeighborsClassifier(), grid_params, cv=10, refit=True)
grid4_2.fit(new_X_train_bow, y_train)
print("best-param", grid4_2.best_params_)
print('best-estiamte', grid4_2.best_estimator_)
# Predict on the test matrix reduced by the same selector as the training data.
# The full TF-IDF matrix X_test_bow has 6138 columns and raised
# "X has 6138 features, but KNeighborsClassifier is expecting 1000 features".
grid4_2predictions = grid4_2.predict(new_X_test_bow)

best-param {'metric': 'minkowski', 'n_neighbors': 5, 'weights': 'distance'}
best-estiamte KNeighborsClassifier(weights='distance')


ValueError: X has 6138 features, but KNeighborsClassifier is expecting 1000 features as input.

*   4.9. Apply **LogisticRegression** with **GridSearchCV**

In [None]:
# Task 4.9: 10-fold grid search for logistic regression on the selected TF-IDF
# features.
grid4_3 = GridSearchCV(LogisticRegression(), logreg_param_grid_new, cv=10, refit=True)
grid4_3.fit(new_X_train_bow, y_train)
print("best-param", grid4_3.best_params_)
print('best-estiamte', grid4_3.best_estimator_)
# Predict on the test matrix reduced by the same selector as the training data.
# The full TF-IDF matrix X_test_bow has 6138 columns and raised
# "X has 6138 features, but LogisticRegression is expecting 1000 features".
grid4_3predictions = grid4_3.predict(new_X_test_bow)



best-param {'C': 10, 'max_iter': 100, 'penalty': 'l2', 'solver': 'saga'}
best-estiamte LogisticRegression(C=10, solver='saga')




ValueError: X has 6138 features, but LogisticRegression is expecting 1000 features as input.

*   4.10. Compare the best obtained results among classification algorithms (use PrettyTable to display the results)

In [None]:
# Task 4.10: rebuild each model with the best hyperparameters printed by the
# searches above, then score it on the selected test features.
best4_svm = SVC(C=10, gamma=1, kernel='linear')
best4_knn = KNeighborsClassifier(metric='minkowski', n_neighbors=5, weights='distance')
best4_lo = LogisticRegression(C=10, max_iter=100, penalty='l2', solver='saga')
best4_ran = RandomForestClassifier(max_depth=20, bootstrap=True, min_samples_leaf=1,
                                   min_samples_split=2, n_estimators=300)

algo4 = [best4_svm, best4_knn, best4_lo, best4_ran]
result4 = []
for en4 in algo4:
  en4.fit(new_X_train_bow, y_train)
  temp4_pred = en4.predict(new_X_test_bow)
  accuracy = metrics.accuracy_score(y_test, temp4_pred)
  # Labels are the strings 'neg'/'pos'; 'neg' is scored as the positive class.
  precision, recall, f1 = (
      scorer(y_test, temp4_pred, pos_label='neg')
      for scorer in (metrics.precision_score, metrics.recall_score, metrics.f1_score)
  )
  result4.append([accuracy, precision, recall, f1])

In [None]:
# Task 4.10 summary table.
# Rows are read from result4 (the movie-review scores). The table used to read
# `result`, which holds the Task 2 breast-cancer scores, so it repeated the
# Task 2 numbers.
table4_5 = PrettyTable(['Algo', 'accuracy', 'precision', 'recall', 'f1'])
# Labels follow the order in which the models were scored into result4.
for algo_name, scores in zip(["svm", "Knn", "logistic", "Random forest"], result4):
    table4_5.add_row([algo_name, *scores])

print(table4_5)

+---------------+--------------------+--------------------+--------------------+--------------------+
|      Algo     |      accuracy      |     precision      |       recall       |         f1         |
+---------------+--------------------+--------------------+--------------------+--------------------+
|      svm      | 0.9766081871345029 | 0.9622641509433962 |        1.0         | 0.9807692307692307 |
|      Knn      | 0.9532163742690059 | 0.9272727272727272 |        1.0         | 0.9622641509433962 |
|    logistic   | 0.9707602339181286 | 0.9532710280373832 |        1.0         | 0.9760765550239235 |
| Random forest | 0.9473684210526315 | 0.9345794392523364 | 0.9803921568627451 | 0.9569377990430622 |
+---------------+--------------------+--------------------+--------------------+--------------------+


# Finally,
Save a copy in your GitHub. Remember to rename the notebook.