<a href="https://colab.research.google.com/github/jumbokh/nknu-class/blob/main/notebook/Topic_2_ML_params_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
!ln -fs /content/gdrive/My\ Drive /app

Mounted at /content/gdrive


In [2]:
!tar -xzvf /app/cuDNN/cudnn-10.0-linux-x64-v7.5.0.56.tgz -C /usr/local/
!chmod a+r /usr/local/cuda/include/cudnn.h

# 檢查是否安裝成功
!cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2

cuda/include/cudnn.h
cuda/NVIDIA_SLA_cuDNN_Support.txt
cuda/lib64/libcudnn.so
cuda/lib64/libcudnn.so.7
cuda/lib64/libcudnn.so.7.5.0
cuda/lib64/libcudnn_static.a
#define CUDNN_MAJOR 7
#define CUDNN_MINOR 5
#define CUDNN_PATCHLEVEL 0
--
#define CUDNN_VERSION (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

#include "driver_types.h"


### [Data source](https://github.com/coding-maniacs/machine_learning_parameter_tuning/tree/master/data) 

In [3]:
import json as j
import pandas as pd
import numpy as np

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

In [5]:
json_data = None
with open('/app/data/yelp_academic_dataset_review.json') as data_file:
    lines = data_file.readlines()
    joined_lines = "[" + ",".join(lines) + "]"

    json_data = j.loads(joined_lines)

data = pd.DataFrame(json_data)
data.head()

Unnamed: 0,votes,user_id,review_id,stars,date,text,type,business_id
0,"{'funny': 0, 'useful': 5, 'cool': 2}",rLtl8ZkDX5vH5nAx9C3q5Q,fWKvX83p0-ka4JS3dc6E5A,5,2011-01-26,My wife took me here on my birthday for breakf...,review,9yKzy9PApeiPPOUJEtnvkg
1,"{'funny': 0, 'useful': 0, 'cool': 0}",0a2KyEL0d3Yb1V6aivbIuQ,IjZ33sJrzXqU-0X6U8NwyA,5,2011-07-27,I have no idea why some people give bad review...,review,ZRJwVLyzEJq1VAihDhYiow
2,"{'funny': 0, 'useful': 1, 'cool': 0}",0hT2KtfLiobPvh6cDC8JQg,IESLBzqUCLdSzSqm0eCSxQ,4,2012-06-14,love the gyro plate. Rice is so good and I als...,review,6oRAC4uyJCsJl1X0WZpVSA
3,"{'funny': 0, 'useful': 2, 'cool': 1}",uZetl9T0NcROGOyFfughhg,G-WvGaISbqqaMHlNnByodA,5,2010-05-27,"Rosie, Dakota, and I LOVE Chaparral Dog Park!!...",review,_1QQZuf4zZOyFCvXc0o6Vg
4,"{'funny': 0, 'useful': 0, 'cool': 0}",vYmM4KTsC8ZfQBg-j5MWkw,1uJFq2r5QfJG_6ExMRCaGw,5,2012-01-05,General Manager Scott Petello is a good egg!!!...,review,6ozycU1RpktNG2-1BroVtw


In [6]:
data = data[data.stars != 3]
data['sentiment'] = data['stars'] >= 4

X_train, X_test, y_train, y_test = train_test_split(data, data.sentiment, test_size=0.2)

In [7]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression())
])

In [8]:
from sklearn.model_selection import cross_val_score

In [9]:
scores = cross_val_score(pipeline, X_train.text, y_train, scoring='accuracy', cv=5, n_jobs=-1)

mean = scores.mean()
std = scores.std()
print(mean)
print(std)

print(pipeline.get_params())

0.9404054357952903
0.00156373658254123
{'memory': None, 'steps': [('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)), ('tfidf', TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)), ('classifier', LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False))], 'verbose': False, 'vectorizer': CountVectorizer(analyzer='word', bi

In [10]:
from sklearn.model_selection import GridSearchCV
grid = {
    'vectorizer__ngram_range': [(1, 1), (2, 1)],
    'vectorizer__stop_words': [None, 'english'],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__C': [1.0, 0.8],
    'classifier__class_weight': [None, 'balanced'],
    'classifier__n_jobs': [-1]
}

In [11]:
grid_search = GridSearchCV(pipeline, param_grid=grid, scoring='accuracy', n_jobs=-1, cv=5)
grid_search.fit(X=X_train.text, y=y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('vectorizer',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                      

In [12]:
print("-----------")
print(grid_search.best_score_)
print(grid_search.best_params_)

-----------
0.9404054357952903
{'classifier__C': 1.0, 'classifier__class_weight': None, 'classifier__n_jobs': -1, 'classifier__penalty': 'l2', 'vectorizer__ngram_range': (1, 1), 'vectorizer__stop_words': None}


In [17]:
model = pipeline.fit(X_train.text, y_train)
predicted = model.predict(X_test.text)
print("model1: " + str(np.mean(predicted == y_test)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


model1: 0.9419671541288648


In [22]:
pipeline2 = Pipeline([
    ('vectorizer', CountVectorizer(min_df=1)),
    ('tfidf', TfidfTransformer()),
    ('classifier', LogisticRegression(C=1.0, penalty='l1', solver='liblinear',class_weight=None, n_jobs=-1))
    #('classifier', LogisticRegression(C=1.0, class_weight=None, n_jobs=-1, penalty='l1'))
])

In [23]:
model2 = pipeline2.fit(X_train.text, y_train)
predicted2 = model2.predict(X_test.text)
print("model2: " + str(np.mean(predicted2 == y_test)))

  " = {}.".format(effective_n_jobs(self.n_jobs)))


model2: 0.9431751008764039
