# Sentiment Analysis pada Komentar TikTok Bank BCA Menggunakan Machine Learning

Proyek ini bertujuan untuk menganalisis sentimen komentar yang ada di TikTok terkait Bank BCA. Data komentar didapatkan melalui teknik web scraping, yang memungkinkan pengambilan data secara otomatis dari platform TikTok.

Setelah data terkumpul, saya menerapkan machine learning untuk melakukan sentiment analysis, yang bertujuan untuk mengklasifikasikan komentar menjadi sentimen positif, negatif, atau netral. Model ini dibangun dengan menggunakan berbagai teknik pra-pemrosesan teks serta model pembelajaran mesin untuk mencapai hasil yang akurat dalam mengkategorikan opini pengguna.

Proyek ini memberikan wawasan yang bermanfaat tentang bagaimana publik merespons layanan Bank BCA di media sosial, yang bisa digunakan untuk meningkatkan strategi bisnis dan layanan pelanggan.








# IMPORT LIBRARY

In [None]:
!pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [None]:
import requests
import pandas as pd
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk import word_tokenize
from nltk.corpus import stopwords
import re
import warnings
import plotly.figure_factory as ff


# NOTE(review): this blanket filter also hides pandas/scikit-learn warnings that may point at real problems.
warnings.filterwarnings("ignore")

import nltk
# Tokenizer models and stopword lists required by the preprocessing step.
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize models from 'punkt_tab' instead of 'punkt';
# on older releases this call only reports that the package is unknown.
nltk.download('punkt_tab')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# LABELING
Pelabelan dilakukan secara manual, kemudian jumlah data pada tiap kelas diseimbangkan.

# PRE - PROCESSING




In [None]:
# Load the labelled comments and count how many rows fall into each `complaint` class.
data = pd.read_excel("tiktok_sentimenn.xlsx")
data.value_counts("complaint")

Unnamed: 0_level_0,count
complaint,Unnamed: 1_level_1
0.0,1627
1.0,1238


In [None]:
# Column dtypes and non-null counts of the raw frame (reveals any missing labels).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2867 entries, 0 to 2866
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   comments   2867 non-null   object 
 1   complaint  2865 non-null   float64
dtypes: float64(1), object(1)
memory usage: 44.9+ KB


In [None]:
def remove_emojis(text):
    """Return ``text`` with every run of emoji / pictographic characters removed.

    The character class below covers several supplementary-plane symbol ranges
    plus the BMP ranges U+2600-U+26FF and U+2700-U+27BF. Matched runs are
    replaced by the empty string, so surrounding whitespace is left untouched.
    """
    symbol_ranges = (
        "\U0001F600-\U0001F64F",
        "\U0001F300-\U0001F5FF",
        "\U0001F680-\U0001F6FF",
        "\U0001F700-\U0001F77F",
        "\U0001F780-\U0001F7FF",
        "\U0001F800-\U0001F8FF",
        "\U0001F900-\U0001F9FF",
        "\U0001F1E0-\U0001F1FF",
        "\u2600-\u26FF",
        "\u2700-\u27BF",
    )
    character_class = "[" + "".join(symbol_ranges) + "]+"
    return re.sub(character_class, r'', text)

def remove_numbers(text):
    """Return ``text`` with every run of digit characters deleted."""
    digit_runs = re.compile(r'\d+')
    return digit_runs.sub('', text)


# Lazily filled cache so the stemmer and stopword set are built once, not once per comment.
_PREPROCESSING_RESOURCES = {}


def _get_preprocessing_resources():
    """Return ``(stemmer, stop_words)``, creating them on first use and reusing them afterwards."""
    if not _PREPROCESSING_RESOURCES:
        _PREPROCESSING_RESOURCES["stemmer"] = StemmerFactory().create_stemmer()
        _PREPROCESSING_RESOURCES["stop_words"] = set(stopwords.words('indonesian'))
    return _PREPROCESSING_RESOURCES["stemmer"], _PREPROCESSING_RESOURCES["stop_words"]


def preprocessing(comments) :
    """Normalise one raw comment into a space-joined string of stemmed tokens.

    Steps, in order: drop ``@mention`` handles, strip emojis, strip digits,
    tokenize, lowercase, remove Indonesian stopwords, stem each remaining token.

    Parameters
    ----------
    comments : str
        A single raw comment.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filtering.
    """
    # The original rebuilt the stemmer factory and the stopword set on every
    # call; both are loop-invariant, so they are shared across calls here.
    stemmer, stop_words = _get_preprocessing_resources()

    without_mentions = re.sub(r'@\w+', '', comments)
    without_emojis = remove_emojis(text=without_mentions)
    without_digits = remove_numbers(without_emojis)

    lowered = [token.lower() for token in word_tokenize(without_digits)]
    kept = [word for word in lowered if word not in stop_words]

    return " ".join(stemmer.stem(word) for word in kept)

# Store the cleaned version of every comment in a new `after` column.
data['after'] = data.comments.apply(preprocessing)

In [None]:
# Keep only the cleaned text and its label. `.copy()` makes this an independent
# frame, so later clean-up steps never operate on a view of `data`.
dataset = data[['after' , 'complaint']].copy()
dataset

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0
...,...,...
2862,solusi cs kartu kredit,1.0
2863,proses naik limit kartu kredit,1.0
2864,sulit cashback guna kartu kredit,1.0
2865,pemberitahuan promo kartu kredit,1.0


In [None]:
# Discard comments that became empty after preprocessing. `.copy()` detaches the
# filtered frame so the later in-place clean-up does not act on a derived slice.
dataset = dataset[dataset.after != ""].copy()
dataset

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0
...,...,...
2862,solusi cs kartu kredit,1.0
2863,proses naik limit kartu kredit,1.0
2864,sulit cashback guna kartu kredit,1.0
2865,pemberitahuan promo kartu kredit,1.0


In [None]:
# Count missing values per column before dropping incomplete rows.
dataset.isna().sum()

Unnamed: 0,0
after,0
complaint,2


In [None]:
# Drop rows with a missing value. Rebinding (instead of inplace=True) avoids
# mutating a frame that was derived by slicing/filtering another frame.
dataset = dataset.dropna()

In [None]:
# Confirm that no missing values remain.
dataset.isna().sum()

Unnamed: 0,0
after,0
complaint,0


In [None]:
# Renumber rows 0..n-1 after the filtering above; rebinding keeps the step free of in-place mutation.
dataset = dataset.reset_index(drop=True)

In [None]:
# Persist the cleaned text/label pairs (without the index) for the modelling stage.
dataset.to_csv("modelling_data_final.csv" , index=False)

# MODELLING


In [None]:
# Reload the file written at the end of preprocessing so modelling starts from the saved CSV.
dataset = pd.read_csv("modelling_data_final.csv")
dataset.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Class distribution of the cleaned data.
dataset.complaint.value_counts()

Unnamed: 0_level_0,count
complaint,Unnamed: 1_level_1
0.0,1481
1.0,1237


In [None]:
# Row count, dtypes and non-null counts of the reloaded data.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2718 entries, 0 to 2717
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   after      2718 non-null   object 
 1   complaint  2718 non-null   float64
dtypes: float64(1), object(1)
memory usage: 42.6+ KB


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
from sklearn.linear_model import LogisticRegression ,  PassiveAggressiveClassifier, Perceptron, SGDClassifier, RidgeClassifier , RidgeClassifierCV
from sklearn.neighbors import KNeighborsClassifier , NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB , MultinomialNB , BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier , ExtraTreesClassifier , GradientBoostingClassifier , RandomForestClassifier , BaggingClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis,QuadraticDiscriminantAnalysis
from sklearn.semi_supervised import LabelPropagation, LabelSpreading
from sklearn.calibration import CalibratedClassifierCV
from sklearn.dummy import DummyClassifier



from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer
from sklearn.metrics import f1_score , roc_auc_score , recall_score , precision_score , accuracy_score , balanced_accuracy_score , classification_report , confusion_matrix

In [None]:
# Unigram + bigram counts for every cleaned comment.
# NOTE(review): the vocabulary is learned from the full dataset, i.e. before the
# train/test split, so test comments influence the feature set.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(dataset.after)

# Dense frame with one column per n-gram.
X_Count = pd.DataFrame(
    data=X_Count_matrix.toarray(),
    columns=Count_Data.get_feature_names_out(),
)
X_Count

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yummy drool,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Unigram + bigram TF-IDF weights for every cleaned comment.
# NOTE(review): vocabulary and IDF weights are learned from the full dataset,
# i.e. before the train/test split, so test comments influence the features.
Tfid_Data = TfidfVectorizer(ngram_range=(1, 2))
X_Tfid_Matrix = Tfid_Data.fit_transform(dataset.after)

# Dense frame with one column per n-gram.
X_Tfid = pd.DataFrame(
    data=X_Tfid_Matrix.toarray(),
    columns=Tfid_Data.get_feature_names_out(),
)
X_Tfid

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yummy drool,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Attach the complaint label as the last column of the count features (rows are aligned on the index).
Count_df = pd.concat([X_Count , dataset.complaint] , axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [None]:
# Attach the complaint label as the last column of the TF-IDF features (rows are aligned on the index).
Tfid_df = pd.concat([X_Tfid , dataset.complaint] , axis=1)
Tfid_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,complaint
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Seed shared by every estimator that draws random numbers, so repeated runs of
# the benchmark give the same scores. It matches the seed used for the splits.
RANDOM_STATE = 33

# Candidate classifiers benchmarked by `classifier_report`, keyed by display name.
# Apart from the seed and the logging flags, every estimator uses library defaults.
MODELS = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Gaussian Naive Bayes": GaussianNB(),
    "Support Vector Classifier": SVC(),
    "AdaBoost Classifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Linear Discriminant Analysis": LinearDiscriminantAnalysis(),
    "Extra Trees Classifier": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Random Forest Classifier": RandomForestClassifier(random_state=RANDOM_STATE),
    "Bagging Classifier": BaggingClassifier(random_state=RANDOM_STATE),
    # verbose=-1 / verbose=0 silence the per-iteration training logs that
    # otherwise bury the result table in the cell output.
    "LightGBM Classifier": LGBMClassifier(random_state=RANDOM_STATE, verbose=-1),
    "CatBoost Classifier": CatBoostClassifier(random_state=RANDOM_STATE, verbose=0),
    "XGBoost Classifier": XGBClassifier(random_state=RANDOM_STATE),
    "Quadratic Discriminant Analysis": QuadraticDiscriminantAnalysis(),
    "Nearest Centroid": NearestCentroid(),
    "Label Propagation": LabelPropagation(),
    "Label Spreading": LabelSpreading(),
    "Passive Aggressive Classifier": PassiveAggressiveClassifier(random_state=RANDOM_STATE),
    "Perceptron": Perceptron(),
    "SGD Classifier": SGDClassifier(random_state=RANDOM_STATE),
    "Bernoulli Naive Bayes": BernoulliNB(),
    "Calibrated Classifier CV": CalibratedClassifierCV(),
    "Ridge Classifier": RidgeClassifier(),
    "Dummy Classifier": DummyClassifier(),
    "Ridge Classifier CV": RidgeClassifierCV(),
}

In [None]:
def style_dataframe(df) :
  """Rank a metrics table and return it as a colour-graded pandas Styler.

  Rows are ordered from best to worst by Accuracy, then F1-Score, then ROC-AUC.
  The Accuracy, F1-Score and ROC-AUC columns each get their own background
  gradient (Blues, Reds and Greens respectively).

  Parameters
  ----------
  df : pandas.DataFrame
      Must contain the columns "Accuracy", "F1-Score" and "ROC-AUC".

  Returns
  -------
  pandas.io.formats.style.Styler
      Styler wrapping a sorted copy of ``df``; ``df`` itself is left untouched.
  """
  # Sort into a new frame: the original sorted in place and thereby reordered
  # the caller's DataFrame as a side effect.
  ranked = df.sort_values(["Accuracy" , "F1-Score" , "ROC-AUC"] , ascending=False)
  return (
      ranked.style
      .background_gradient(cmap="Blues" , subset=["Accuracy"])
      .background_gradient(cmap="Reds" , subset=["F1-Score"])
      .background_gradient(cmap="Greens" , subset=["ROC-AUC"])
  )


def classifier_report(X_train , X_test , y_train , y_test , print=False) :
  """Fit every estimator in ``MODELS`` and tabulate its test-set metrics.

  Each estimator is fitted on ``(X_train, y_train)`` and scored on
  ``(X_test, y_test)``. All metrics are rounded to two decimals.

  Parameters
  ----------
  X_train, X_test : array-like
      Training and test feature matrices.
  y_train, y_test : array-like
      Training and test labels.
  print : bool, default False
      When true, a text summary is printed for every model as it is evaluated.

  Returns
  -------
  pandas.io.formats.style.Styler
      Table indexed by model name with the columns Accuracy, F1-Score, Recall,
      Precision, Balance (balanced accuracy) and ROC-AUC, as ranked and
      coloured by ``style_dataframe``. ROC-AUC is missing for models whose
      probabilities could not be scored.

  Notes
  -----
  The estimator objects stored in ``MODELS`` are fitted in place, so they stay
  fitted on the most recent training data after this function returns.
  """
  # The `print` parameter shadows the builtin inside this function, so the
  # original `print(...)` calls tried to call a bool and raised TypeError
  # whenever print=True. The builtin is reached explicitly instead.
  import builtins

  results = {
      "Model" : [] ,
      "Accuracy" : [] ,
      "F1-Score" : [] ,
      "Recall" : [] ,
      "Precision" : [] ,
      "Balance" : [] ,
      "ROC-AUC" : [] ,
  }

  for name_model , model in MODELS.items() :

    model.fit(X_train , y_train)
    y_pred = model.predict(X_test)

    accuracy = np.round(accuracy_score(y_test , y_pred) , 2)
    f1 = np.round(f1_score(y_test , y_pred) , 2)
    recall = np.round(recall_score(y_test , y_pred) , 2)
    precision = np.round(precision_score(y_test , y_pred) , 2)
    balance_score = np.round(balanced_accuracy_score(y_test , y_pred) , 2)

    # ROC-AUC is best effort: not every estimator offers predict_proba.
    # `except Exception` (rather than a bare except) keeps Ctrl-C working.
    try :
      y_pred_proba = model.predict_proba(X_test)[:, 1]
      roc_auc = np.round(roc_auc_score(y_test , y_pred_proba) , 2)
    except Exception :
      roc_auc = None

    if print :

      builtins.print("==============================================")

      builtins.print(f"Name Model     : {name_model}" )
      builtins.print(f"Model          : {model}" )
      builtins.print(f"Accuracy Score : {accuracy}")
      builtins.print(f"F1 Score       : {f1}")
      builtins.print(f"Recall         : {recall}")
      builtins.print(f"precision      : {precision}")

      builtins.print("==============================================")
      builtins.print("\n")

    results["Model"].append(name_model)
    results["Accuracy"].append(accuracy)
    results["F1-Score"].append(f1)
    results["Recall"].append(recall)
    results["Precision"].append(precision)
    results["Balance"].append(balance_score)
    results["ROC-AUC"].append(roc_auc)

  report = pd.DataFrame(results)
  return style_dataframe(report.set_index("Model"))


## Case 1 ( tanpa sampling )


CountVectorizer

In [None]:
# Split the count table into the label vector and the n-gram feature matrix.
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
# Benchmark every model on the unsampled count features; `model` holds the styled result table.
model = classifier_report(X_train , X_test , y_train , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6890770	total: 94.2ms	remaining: 1m 34s
1:	learn: 0.6840820	total: 130ms	remaining: 1m 5s
2:	learn: 0.6808104	total: 166ms	remaining: 55.1s
3:	learn: 0.6773742	total: 203ms	remaining: 50.5s
4:	learn: 0.6739955	total: 238ms	remaining: 47.4s
5:	learn: 0.6705172	total: 273ms	remaining: 45.2s
6:	learn: 0.6675213	total: 310ms	remaining: 44s
7:	learn: 0.6642431	total: 346ms	remaining: 42.9s
8:	learn: 0.6602

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.85,0.92,0.89,0.96
Random Forest Classifier,0.9,0.88,0.85,0.92,0.89,0.96
Calibrated Classifier CV,0.9,0.88,0.86,0.91,0.89,0.95
Ridge Classifier,0.89,0.88,0.85,0.91,0.89,
Ridge Classifier CV,0.89,0.88,0.85,0.91,0.89,
Logistic Regression,0.89,0.87,0.84,0.91,0.89,0.95
Passive Aggressive Classifier,0.89,0.87,0.86,0.89,0.88,
SGD Classifier,0.89,0.87,0.84,0.9,0.88,
Perceptron,0.88,0.86,0.81,0.92,0.88,
Bernoulli Naive Bayes,0.88,0.85,0.77,0.94,0.87,0.94


TfidfVectorizer

In [None]:
# Split the TF-IDF table into the label vector and the n-gram feature matrix.
y = Tfid_df['complaint']
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
# Benchmark every model on the unsampled TF-IDF features; `model` holds the styled result table.
model = classifier_report(X_train , X_test , y_train , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2206
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6875735	total: 105ms	remaining: 1m 45s
1:	learn: 0.6841169	total: 177ms	remaining: 1m 28s
2:	learn: 0.6806207	total: 231ms	remaining: 1m 16s
3:	learn: 0.6766198	total: 288ms	remaining: 1m 11s
4:	learn: 0.6723656	total: 363ms	remaining: 1m 12s
5:	learn: 0.6683701	total: 434ms	remaining: 1m 11s
6:	learn: 0.6641506	total: 497ms	remaining: 1m 10s
7:	learn: 0.6604793	total: 559ms	remaining: 1m 9s
8:	learn

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SGD Classifier,0.9,0.89,0.86,0.91,0.9,
Ridge Classifier CV,0.9,0.88,0.83,0.94,0.89,
Extra Trees Classifier,0.89,0.87,0.82,0.94,0.89,0.96
Calibrated Classifier CV,0.89,0.87,0.84,0.91,0.89,0.96
Ridge Classifier,0.89,0.87,0.81,0.94,0.88,
Random Forest Classifier,0.89,0.86,0.8,0.94,0.88,0.95
Passive Aggressive Classifier,0.88,0.86,0.83,0.9,0.88,
Perceptron,0.88,0.86,0.78,0.95,0.87,
Bernoulli Naive Bayes,0.88,0.85,0.77,0.94,0.87,0.94
Logistic Regression,0.87,0.84,0.75,0.95,0.86,0.95


## CASE 2 ( over sampling )

CountVectorizer + oversampling

In [None]:
# Split the count table into the label vector and the n-gram feature matrix.
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the TRAINING split only; the test split stays untouched.
# A fixed seed makes the synthetic samples (and therefore the scores) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class proportions after resampling. The original passed 'complaint' positionally,
# which Series.value_counts takes as its `normalize` flag.
y_train_over.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


In [None]:
# Train on the oversampled training set, evaluate on the original test split.
model = classifier_report(X_train_over , X_test , y_train_over , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 386
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 136
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6890218	total: 35.4ms	remaining: 35.3s
1:	learn: 0.6838694	total: 71.8ms	remaining: 35.8s
2:	learn: 0.6807524	total: 111ms	remaining: 36.9s
3:	learn: 0.6767736	total: 147ms	remaining: 36.6s
4:	learn: 0.6738394	total: 182ms	remaining: 36.1s
5:	learn: 0.6703389	total: 217ms	remaining: 35.9s
6:	learn: 0.6673316	total: 252ms	remaining: 35.7s
7:	learn: 0.6640436	total: 288ms	remaining: 35.7s
8:	learn: 0.6598215	total: 326ms	remaining: 35.9s
9:	learn: 0.6569292

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge Classifier,0.89,0.88,0.87,0.88,0.89,
Ridge Classifier CV,0.89,0.88,0.85,0.91,0.89,
Logistic Regression,0.89,0.87,0.85,0.89,0.88,0.94
Bernoulli Naive Bayes,0.88,0.87,0.87,0.87,0.88,0.93
Calibrated Classifier CV,0.88,0.86,0.84,0.89,0.88,0.94
Perceptron,0.87,0.85,0.84,0.87,0.87,
SGD Classifier,0.87,0.85,0.85,0.86,0.87,
Support Vector Classifier,0.87,0.84,0.76,0.95,0.86,
Passive Aggressive Classifier,0.86,0.85,0.9,0.81,0.86,
XGBoost Classifier,0.86,0.83,0.8,0.87,0.85,0.92


TfidfVectorizer + oversampling

In [None]:
# Split the TF-IDF table into the label vector and the n-gram feature matrix.
y = Tfid_df['complaint']
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the TRAINING split only; the test split stays untouched.
# A fixed seed makes the synthetic samples (and therefore the scores) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class proportions after resampling. The original passed 'complaint' positionally,
# which Series.value_counts takes as its `normalize` flag.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Train on the oversampled training set, evaluate on the original test split.
model = classifier_report(X_train_over , X_test , y_train_over , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2726
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 163
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6895733	total: 52.3ms	remaining: 52.3s
1:	learn: 0.6846159	total: 109ms	remaining: 54.3s
2:	learn: 0.6794110	total: 163ms	remaining: 54s
3:	learn: 0.6760405	total: 217ms	remaining: 54s
4:	learn: 0.6717644	total: 278ms	remaining: 55.3s
5:	learn: 0.6672831	total: 335ms	remaining: 55.5s
6:	learn: 0.6631919	total: 388ms	remaining: 55.1s
7:	learn: 0.6598421	total: 445ms	remaining: 55.1s
8:	learn: 0.6558761	total: 503ms	remaining: 55.3s
9:	learn: 0.6517652	tot

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.83,0.95,0.89,0.96
Calibrated Classifier CV,0.9,0.88,0.84,0.92,0.89,0.96
SGD Classifier,0.9,0.88,0.86,0.91,0.89,
Ridge Classifier CV,0.9,0.88,0.84,0.94,0.9,
Bernoulli Naive Bayes,0.89,0.87,0.81,0.94,0.88,0.94
Passive Aggressive Classifier,0.89,0.87,0.83,0.91,0.88,
Ridge Classifier,0.89,0.87,0.82,0.93,0.89,
Random Forest Classifier,0.88,0.86,0.8,0.93,0.88,0.96
Perceptron,0.88,0.86,0.77,0.96,0.87,
Support Vector Classifier,0.88,0.84,0.74,0.97,0.86,


## CASE 3 (Downsampling)

In [None]:
# Split the count table into the label vector and the n-gram feature matrix.
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows from the TRAINING split only.
# A fixed seed makes the retained rows (and therefore the scores) repeatable.
under = RandomUnderSampler(random_state=33)
X_train_under , y_train_under = under.fit_resample(X_train , y_train)
# Class proportions after resampling. The original passed 'complaint' positionally,
# which Series.value_counts takes as its `normalize` flag.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Train on the undersampled training set, evaluate on the original test split.
model = classifier_report(X_train_under , X_test , y_train_under , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001899 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 337
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6892392	total: 32.8ms	remaining: 32.8s
1:	learn: 0.6851788	total: 65.9ms	remaining: 32.9s
2:	learn: 0.6821182	total: 98.7ms	remaining: 32.8s
3:	learn: 0.6789539	total: 142ms	remaining: 35.2s
4:	learn: 0.6757651	total: 241ms	remaining: 47.9s
5:	learn: 0.6724855	total: 329ms	remaining: 54.5s
6:	learn: 0.6696643	total: 426ms	remaining: 1m
7:	learn: 0.6670750	total: 507ms	remaining: 1m 2s
8:	learn: 0.6639297	total: 587ms	remaining: 1m 4s
9:	learn: 0.6609831	tot

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.86,0.9,0.89,0.96
Bernoulli Naive Bayes,0.89,0.88,0.85,0.91,0.89,0.93
Passive Aggressive Classifier,0.89,0.88,0.86,0.9,0.89,
Random Forest Classifier,0.89,0.87,0.84,0.91,0.88,0.96
Logistic Regression,0.89,0.87,0.85,0.89,0.88,0.95
Calibrated Classifier CV,0.89,0.87,0.86,0.89,0.89,0.95
Support Vector Classifier,0.89,0.87,0.82,0.93,0.88,
Ridge Classifier,0.89,0.87,0.85,0.9,0.89,
Ridge Classifier CV,0.89,0.87,0.85,0.9,0.89,
Perceptron,0.88,0.87,0.89,0.85,0.88,


TfidfVectorizer + UnderSampling

In [None]:
# Split the TF-IDF table into the label vector and the n-gram feature matrix.
y = Tfid_df['complaint']
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly discard majority-class rows from the TRAINING split only.
# A fixed seed makes the retained rows (and therefore the scores) repeatable.
under = RandomUnderSampler(random_state=33)
X_train_under , y_train_under = under.fit_resample(X_train , y_train)
# Class proportions after resampling. The original passed 'complaint' positionally,
# which Series.value_counts takes as its `normalize` flag.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Train on the undersampled training set, evaluate on the original test split.
model = classifier_report(X_train_under , X_test , y_train_under , y_test , print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001966 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1967
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 121
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6886825	total: 43.9ms	remaining: 43.9s
1:	learn: 0.6848756	total: 80.8ms	remaining: 40.3s
2:	learn: 0.6800554	total: 120ms	remaining: 39.8s
3:	learn: 0.6765489	total: 156ms	remaining: 38.8s
4:	learn: 0.6732628	total: 193ms	remaining: 38.3s
5:	learn: 0.6698333	total: 232ms	remaining: 38.4s
6:	learn: 0.6663721	total: 268ms	remaining: 38.1s
7:	learn: 0.6631862	total: 305ms	remaining: 37.8s
8:	learn: 0.6602883	total: 343ms	remaining: 37.8s
9:	learn: 0.6564034	

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge Classifier CV,0.9,0.89,0.86,0.92,0.9,
Extra Trees Classifier,0.9,0.88,0.83,0.94,0.89,0.96
Bernoulli Naive Bayes,0.9,0.88,0.85,0.92,0.89,0.93
SGD Classifier,0.9,0.88,0.83,0.94,0.9,
Ridge Classifier,0.9,0.88,0.84,0.92,0.89,
Calibrated Classifier CV,0.89,0.88,0.86,0.9,0.89,0.96
Random Forest Classifier,0.89,0.87,0.81,0.95,0.89,0.95
Passive Aggressive Classifier,0.89,0.87,0.84,0.91,0.88,
Support Vector Classifier,0.89,0.86,0.78,0.96,0.88,
Perceptron,0.89,0.86,0.78,0.96,0.88,


## Case 4 ( menambahkan fitur len text )

CountVectorizer

In [None]:
# Fresh copy of the cleaned dataset for the experiment that adds a text-length feature.
data_len = pd.read_csv("modelling_data_final.csv")
data_len.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Extra feature: number of whitespace-separated tokens in each cleaned comment.
data_len["text_len"] = data_len['after'].str.split().str.len()
data_len.head()

Unnamed: 0,after,complaint,text_len
0,mbanking lot transaksi lelet banget lokasi nyala,1.0,7
1,kartu kadaluwarsa urus semua cabang bca,0.0,6
2,bca mobile merah y,1.0,4
3,bca mobile merah y,1.0,4
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0,12


In [None]:
# Unigram + bigram counts for the Case 4 data.
# NOTE(review): the vocabulary is learned from the full dataset, i.e. before the
# train/test split, so test comments influence the feature set.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(data_len.after)

# Dense frame with one column per n-gram.
X_Count = pd.DataFrame(
    data=X_Count_matrix.toarray(),
    columns=Count_Data.get_feature_names_out(),
)

In [None]:
# Count features + text length + label in one table.
# NOTE(review): this rebinds `Count_df`, the name the Case 1-3 cells read from; re-running
# those cells after this one would silently include the extra `text_len` column.
Count_df = pd.concat([X_Count , data_len.text_len ,data_len.complaint] , axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,text_len,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,1.0


In [None]:
# Split the table into the label vector and the feature matrix (n-gram counts plus text_len).
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train , X_test , y_train , y_test = train_test_split(X,y , test_size=0.25 , random_state = 33)

In [None]:
# Benchmark every model on count features plus text length, without resampling.
model = classifier_report(X_train , X_test , y_train , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002026 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 400
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 132
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6870022	total: 36.8ms	remaining: 36.8s
1:	learn: 0.6825351	total: 73.2ms	remaining: 36.5s
2:	learn: 0.6786399	total: 114ms	remaining: 37.8s
3:	learn: 0.6743781	total: 150ms	remaining: 37.5s
4:	learn: 0.6695243	total: 186ms	remaining: 37s
5:	learn: 0.6664863	total: 223ms	remaining: 37s
6:	learn: 0.6631023	total: 259ms	remaining: 36.8s
7:	learn: 0.6596130	total: 297ms	remaining: 36.8s
8:	learn: 0.6553356	total: 352ms	remaining: 38.7s
9:	learn: 0.6518667	total: 388ms

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.85,0.92,0.89,0.96
Random Forest Classifier,0.89,0.87,0.82,0.93,0.88,0.96
Logistic Regression,0.89,0.87,0.85,0.9,0.89,0.95
Calibrated Classifier CV,0.89,0.87,0.85,0.9,0.89,0.95
Passive Aggressive Classifier,0.89,0.87,0.87,0.88,0.88,
Ridge Classifier,0.89,0.87,0.86,0.89,0.89,
Ridge Classifier CV,0.89,0.87,0.86,0.89,0.89,
Bagging Classifier,0.88,0.86,0.82,0.89,0.87,0.93
Perceptron,0.88,0.86,0.85,0.88,0.88,
SGD Classifier,0.88,0.86,0.81,0.91,0.87,


CountVectorizer + OverSampling

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the TRAINING split only; the test split stays untouched.
# A fixed seed makes the synthetic samples (and therefore the scores) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class proportions after resampling. The original passed 'complaint' positionally,
# which Series.value_counts takes as its `normalize` flag.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Train on the oversampled training set, evaluate on the original test split.
model = classifier_report(X_train_over , X_test , y_train_over , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 409
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 136
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6869348	total: 36ms	remaining: 35.9s
1:	learn: 0.6823042	total: 71.5ms	remaining: 35.7s
2:	learn: 0.6780151	total: 111ms	remaining: 36.8s
3:	learn: 0.6736540	total: 147ms	remaining: 36.5s
4:	learn: 0.6687435	total: 184ms	remaining: 36.6s
5:	learn: 0.6639770	total: 220ms	remaining: 36.5s
6:	learn: 0.6607162	total: 265ms	remaining: 37.6s
7:	learn: 0.6572228	total: 300ms	remaining: 37.2s
8:	learn: 0.6530093	total: 341ms	remaining: 37.6s
9:	learn: 0.6495363	t

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Logistic Regression,0.89,0.88,0.89,0.87,0.89,0.94
Bernoulli Naive Bayes,0.88,0.87,0.87,0.87,0.88,0.93
Calibrated Classifier CV,0.88,0.87,0.88,0.86,0.88,0.93
Ridge Classifier CV,0.88,0.87,0.86,0.88,0.88,
Ridge Classifier,0.88,0.86,0.88,0.85,0.88,
CatBoost Classifier,0.87,0.85,0.82,0.88,0.87,0.93
Passive Aggressive Classifier,0.86,0.85,0.9,0.81,0.87,
XGBoost Classifier,0.86,0.84,0.82,0.86,0.85,0.93
Support Vector Classifier,0.85,0.81,0.72,0.93,0.84,
LightGBM Classifier,0.84,0.82,0.81,0.83,0.84,0.92


CountVectorizer + UnderSampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly discarding majority-class rows from the
# training split only. The sampler is seeded (33, the same seed the notebook
# passes to train_test_split) so the retained rows are identical on every run.
under = RandomUnderSampler(random_state=33)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

# Series.value_counts takes `normalize` as its first positional parameter, so
# the former 'complaint' argument only worked as a truthy flag. Request the
# class proportions explicitly.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Score the candidate classifiers after training on the under-sampled split;
# evaluation still uses the untouched test split.
# `model` holds the value returned by classifier_report (defined elsewhere),
# which is displayed as the cell output.
model = classifier_report(
    X_train_under, X_test,
    y_train_under, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001950 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 372
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 122
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6895000	total: 33.8ms	remaining: 33.7s
1:	learn: 0.6851817	total: 66.9ms	remaining: 33.4s
2:	learn: 0.6811511	total: 104ms	remaining: 34.4s
3:	learn: 0.6772547	total: 136ms	remaining: 33.8s
4:	learn: 0.6735321	total: 171ms	remaining: 34.1s
5:	learn: 0.6695664	total: 204ms	remaining: 33.8s
6:	learn: 0.6656710	total: 237ms	remaining: 33.6s
7:	learn: 0.6609420	total: 274ms	remaining: 33.9s
8:	learn: 0.6576864	total: 313ms	remaining: 34.5s
9:	learn: 0.6537318	t

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.89,0.86,0.92,0.9,0.96
Random Forest Classifier,0.89,0.88,0.83,0.93,0.89,0.96
Bernoulli Naive Bayes,0.89,0.88,0.84,0.91,0.89,0.94
Ridge Classifier,0.89,0.88,0.87,0.89,0.89,
Ridge Classifier CV,0.89,0.88,0.87,0.89,0.89,
Logistic Regression,0.89,0.87,0.87,0.88,0.89,0.95
Calibrated Classifier CV,0.88,0.87,0.87,0.87,0.88,0.95
Passive Aggressive Classifier,0.88,0.87,0.87,0.86,0.88,
Perceptron,0.88,0.85,0.78,0.94,0.87,
CatBoost Classifier,0.86,0.84,0.81,0.88,0.86,0.94


In [None]:
# Unigram + bigram TF-IDF features built from the `after` text column.
# NOTE(review): the vocabulary and IDF weights are learned from every row of
# data_len, i.e. before the train/test split further down, so held-out comments
# influence the features. Consider fitting on the training rows only.
Tfid_Data = TfidfVectorizer(ngram_range=(1,2))
X_Tfid_Matrix = Tfid_Data.fit_transform(data_len.after)

# Carry data_len's index over to the feature frame. The next step joins the
# features with data_len columns by index (axis=1), and a fresh RangeIndex
# would silently misalign rows / introduce NaNs whenever data_len's index is
# not a clean 0..n-1 range.
X_Tfid = pd.DataFrame(
    data=X_Tfid_Matrix.toarray(),
    columns=Tfid_Data.get_feature_names_out(),
    index=data_len.index,
)

In [None]:
# Put the TF-IDF features, the text_len column and the complaint label side by
# side in one frame (joined on the row index).
Tfid_df = pd.concat(
    [X_Tfid, data_len["text_len"], data_len["complaint"]],
    axis="columns",
)
Tfid_df.head(5)

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,text_len,complaint
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,1.0


TfidfVectorizer

In [None]:
# Target is the complaint label; every remaining column (TF-IDF terms plus
# text_len) is a predictor.
y = Tfid_df['complaint']
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Baseline for this feature set: train on the split as-is (no resampling).
# classifier_report is defined outside this cell; its return value is kept in
# `model` and shown as the cell output.
model = classifier_report(
    X_train,
    X_test,
    y_train,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2232
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 132
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6873439	total: 96.7ms	remaining: 1m 36s
1:	learn: 0.6827439	total: 174ms	remaining: 1m 26s
2:	learn: 0.6786519	total: 249ms	remaining: 1m 22s
3:	learn: 0.6742601	total: 297ms	remaining: 1m 13s
4:	learn: 0.6687412	total: 341ms	remaining: 1m 7s
5:	learn: 0.6658709	total: 383ms	remaining: 1m 3s
6:	learn: 0.6625156	total: 423ms	remaining: 1m
7:	learn: 0.6588331	total: 464ms	remaining: 57.5s
8:	learn: 0.6

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Calibrated Classifier CV,0.9,0.89,0.86,0.91,0.9,0.96
Extra Trees Classifier,0.9,0.88,0.82,0.95,0.89,0.96
Ridge Classifier CV,0.9,0.88,0.85,0.92,0.89,
Ridge Classifier,0.89,0.88,0.83,0.92,0.89,
Random Forest Classifier,0.88,0.86,0.78,0.95,0.87,0.96
Logistic Regression,0.88,0.86,0.79,0.93,0.87,0.95
Perceptron,0.88,0.86,0.81,0.91,0.87,
Bernoulli Naive Bayes,0.88,0.85,0.77,0.94,0.87,0.94
XGBoost Classifier,0.87,0.85,0.8,0.91,0.87,0.93
CatBoost Classifier,0.86,0.84,0.78,0.9,0.86,0.94


TfidfVectorizer + OverSampling

In [None]:
from imblearn.over_sampling import SMOTE

# Synthesize minority-class rows for the training split only. The sampler is
# seeded (33, the same seed the notebook passes to train_test_split) so the
# generated rows, and the scores of models trained on them, are repeatable.
over = SMOTE(random_state=33)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

# Series.value_counts takes `normalize` as its first positional parameter, so
# the former 'complaint' argument only worked as a truthy flag. Request the
# class proportions explicitly.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Same comparison, now trained on the SMOTE-balanced training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_over, X_test,
    y_train_over, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002943 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2882
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 167
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6874779	total: 54.7ms	remaining: 54.6s
1:	learn: 0.6828657	total: 114ms	remaining: 56.9s
2:	learn: 0.6786343	total: 169ms	remaining: 56.1s
3:	learn: 0.6741909	total: 224ms	remaining: 55.9s
4:	learn: 0.6692254	total: 280ms	remaining: 55.6s
5:	learn: 0.6652472	total: 352ms	remaining: 58.3s
6:	learn: 0.6620993	total: 410ms	remaining: 58.1s
7:	learn: 0.6572269	total: 466ms	remaining: 57.8s
8:	learn: 0.6538344	total: 522ms	remaining: 57.5s
9:	learn: 0.6506885

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.82,0.95,0.89,0.96
Calibrated Classifier CV,0.9,0.88,0.85,0.92,0.89,0.96
Ridge Classifier,0.9,0.88,0.85,0.92,0.9,
Ridge Classifier CV,0.9,0.88,0.85,0.92,0.89,
Passive Aggressive Classifier,0.89,0.88,0.87,0.88,0.89,
Bernoulli Naive Bayes,0.89,0.86,0.79,0.95,0.88,0.94
Random Forest Classifier,0.88,0.86,0.79,0.93,0.87,0.96
Logistic Regression,0.88,0.86,0.83,0.9,0.88,0.95
CatBoost Classifier,0.87,0.85,0.82,0.88,0.86,0.94
XGBoost Classifier,0.86,0.84,0.81,0.88,0.86,0.94


TfidfVectorizer + UnderSampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split until the classes
# are balanced. Seeding the sampler (33, as used for train_test_split in this
# notebook) makes the selection reproducible.
under = RandomUnderSampler(random_state=33)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

# `normalize=True` is what the former positional 'complaint' argument was
# implicitly enabling (any non-empty string is truthy).
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Same comparison, now trained on the randomly under-sampled training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_under,
    X_test,
    y_train_under,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1971
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6886884	total: 48.2ms	remaining: 48.2s
1:	learn: 0.6827298	total: 86.1ms	remaining: 43s
2:	learn: 0.6774410	total: 127ms	remaining: 42.3s
3:	learn: 0.6736723	total: 165ms	remaining: 41s
4:	learn: 0.6692447	total: 204ms	remaining: 40.6s
5:	learn: 0.6657129	total: 244ms	remaining: 40.3s
6:	learn: 0.6613839	total: 281ms	remaining: 39.9s
7:	learn: 0.6584555	total: 318ms	remaining: 39.4s
8:	learn: 0.6539065	total: 369ms	remaining: 40.6s
9:	learn: 0.6485804	tota

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.89,0.84,0.93,0.9,0.96
Ridge Classifier CV,0.9,0.89,0.87,0.91,0.9,
Calibrated Classifier CV,0.89,0.88,0.87,0.89,0.89,0.96
Bernoulli Naive Bayes,0.89,0.88,0.85,0.91,0.89,0.94
Ridge Classifier,0.89,0.88,0.86,0.9,0.89,
Perceptron,0.88,0.87,0.89,0.85,0.88,
Random Forest Classifier,0.88,0.86,0.79,0.93,0.87,0.96
Logistic Regression,0.87,0.85,0.82,0.88,0.87,0.95
CatBoost Classifier,0.87,0.85,0.82,0.88,0.86,0.94
XGBoost Classifier,0.87,0.85,0.83,0.88,0.87,0.93


## Case 5 (n-gram range 1–3)

CountVectorizer

In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams of the `after` column.
# NOTE(review): the vocabulary is learned from every row of dataset, i.e. before
# the train/test split further down, so held-out comments shape the feature
# space. Consider fitting on the training rows only.
Count_Data = CountVectorizer(ngram_range=(1,3))
X_Count_matrix = Count_Data.fit_transform(dataset.after)

# Carry dataset's index over to the feature frame. The label column is attached
# afterwards with an index-aligned concat, and a fresh RangeIndex would silently
# misalign rows / introduce NaNs whenever dataset's index is not 0..n-1.
X_Count = pd.DataFrame(
    data=X_Count_matrix.toarray(),
    columns=Count_Data.get_feature_names_out(),
    index=dataset.index,
)
X_Count

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai cabang bank,abai hubung,abai hubung cs,...,zaman,zaman gw,zaman gw kuliah,zaman pake,zaman pake kartu,zharif,zte,zte bca,zte bca mobile,zuu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Attach the complaint label to the count features (joined on the row index).
Count_df = pd.concat(
    [X_Count, dataset["complaint"]],
    axis="columns",
)
Count_df.head(5)

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai cabang bank,abai hubung,abai hubung cs,...,zaman gw,zaman gw kuliah,zaman pake,zaman pake kartu,zharif,zte,zte bca,zte bca mobile,zuu,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [None]:
# Target is the complaint label; all n-gram count columns are predictors.
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# 75/25 train/test partition with a fixed seed so later cells see the same rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Baseline for the 1-3 gram count features: no resampling of the training split.
# classifier_report is defined outside this cell; its return value is kept in
# `model` and shown as the cell output.
model = classifier_report(
    X_train,
    X_test,
    y_train,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002327 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 374
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6897157	total: 66ms	remaining: 1m 5s
1:	learn: 0.6863743	total: 128ms	remaining: 1m 4s
2:	learn: 0.6833177	total: 203ms	remaining: 1m 7s
3:	learn: 0.6801539	total: 265ms	remaining: 1m 5s
4:	learn: 0.6763414	total: 333ms	remaining: 1m 6s
5:	learn: 0.6719162	total: 394ms	remaining: 1m 5s
6:	learn: 0.6687237	total: 459ms	remaining: 1m 5s
7:	learn: 0.6658510	total: 522ms	remaining: 1m 4s
8:	learn: 0.66282

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Calibrated Classifier CV,0.89,0.88,0.85,0.91,0.89,0.95
Extra Trees Classifier,0.89,0.87,0.84,0.91,0.89,0.96
Random Forest Classifier,0.89,0.87,0.81,0.95,0.89,0.96
Logistic Regression,0.89,0.87,0.83,0.92,0.88,0.95
Ridge Classifier,0.89,0.87,0.83,0.91,0.88,
Ridge Classifier CV,0.89,0.87,0.83,0.91,0.88,
Passive Aggressive Classifier,0.89,0.86,0.8,0.94,0.88,
SGD Classifier,0.88,0.86,0.84,0.89,0.88,
Perceptron,0.87,0.86,0.86,0.86,0.87,
Support Vector Classifier,0.86,0.83,0.72,0.97,0.85,


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split with SMOTE. A fixed seed
# (33, matching the train_test_split seed) makes the synthetic rows repeatable,
# so re-running the notebook reproduces the same scores.
over = SMOTE(random_state=33)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

# Ask for proportions explicitly: the old positional 'complaint' argument was
# landing in value_counts' `normalize` parameter and only worked because a
# non-empty string is truthy.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Re-run the comparison with the SMOTE-balanced training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_over, X_test,
    y_train_over, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 387
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 137
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6897635	total: 64.7ms	remaining: 1m 4s
1:	learn: 0.6864827	total: 132ms	remaining: 1m 5s
2:	learn: 0.6834771	total: 199ms	remaining: 1m 6s
3:	learn: 0.6803974	total: 266ms	remaining: 1m 6s
4:	learn: 0.6760893	total: 337ms	remaining: 1m 7s
5:	learn: 0.6718704	total: 403ms	remaining: 1m 6s
6:	learn: 0.6685510	total: 468ms	remaining: 1m 6s
7:	learn: 0.6657119	total: 537ms	remaining: 1m 6s
8:	learn: 0.6628325	total: 601ms	remaining: 1m 6s
9:	learn: 0.6595442	

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge Classifier CV,0.89,0.88,0.85,0.9,0.89,
Logistic Regression,0.88,0.87,0.85,0.89,0.88,0.93
Ridge Classifier,0.88,0.87,0.89,0.85,0.88,
Bernoulli Naive Bayes,0.88,0.86,0.87,0.86,0.88,0.93
Calibrated Classifier CV,0.88,0.86,0.85,0.87,0.87,0.93
SGD Classifier,0.86,0.85,0.88,0.82,0.86,
Perceptron,0.86,0.84,0.86,0.83,0.86,
Support Vector Classifier,0.86,0.83,0.73,0.96,0.85,
CatBoost Classifier,0.85,0.83,0.79,0.88,0.85,0.93
XGBoost Classifier,0.85,0.82,0.78,0.87,0.84,0.92


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by randomly discarding majority-class rows.
# The sampler is seeded (33, matching the train_test_split seed) so the same
# rows are kept on every run.
under = RandomUnderSampler(random_state=33)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

# Ask for proportions explicitly: the old positional 'complaint' argument was
# landing in value_counts' `normalize` parameter and only worked because a
# non-empty string is truthy.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Re-run the comparison with the under-sampled training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_under,
    X_test,
    y_train_under,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001927 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 119
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6896618	total: 61.9ms	remaining: 1m 1s
1:	learn: 0.6860003	total: 125ms	remaining: 1m 2s
2:	learn: 0.6814009	total: 189ms	remaining: 1m 2s
3:	learn: 0.6778039	total: 250ms	remaining: 1m 2s
4:	learn: 0.6747704	total: 312ms	remaining: 1m 2s
5:	learn: 0.6713530	total: 373ms	remaining: 1m 1s
6:	learn: 0.6681671	total: 434ms	remaining: 1m 1s
7:	learn: 0.6650257	total: 493ms	remaining: 1m 1s
8:	learn: 0.6618216	total: 558ms	remaining: 1m 1s
9:	learn: 0.6592222	to

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.91,0.89,0.85,0.93,0.9,0.96
Bernoulli Naive Bayes,0.9,0.88,0.83,0.94,0.9,0.94
Random Forest Classifier,0.89,0.87,0.82,0.93,0.88,0.96
Passive Aggressive Classifier,0.89,0.87,0.85,0.9,0.89,
Perceptron,0.89,0.87,0.86,0.88,0.88,
SGD Classifier,0.89,0.87,0.85,0.89,0.88,
Logistic Regression,0.88,0.87,0.84,0.9,0.88,0.95
Calibrated Classifier CV,0.88,0.86,0.85,0.87,0.87,0.95
Ridge Classifier,0.88,0.86,0.84,0.89,0.88,
Ridge Classifier CV,0.88,0.86,0.84,0.89,0.88,


TfidfVectorizer

In [None]:
# TF-IDF weights over unigrams, bigrams and trigrams of the `after` column.
# NOTE(review): vocabulary and IDF statistics are learned from every row of
# dataset, i.e. before the train/test split further down, so held-out comments
# influence the features. Consider fitting on the training rows only.
Tfid_Data = TfidfVectorizer(ngram_range=(1,3))
X_Tfid_Matrix = Tfid_Data.fit_transform(dataset.after)

# Carry dataset's index over to the feature frame. The label column is attached
# afterwards with an index-aligned concat, and a fresh RangeIndex would silently
# misalign rows / introduce NaNs whenever dataset's index is not 0..n-1.
X_Tfid = pd.DataFrame(
    data=X_Tfid_Matrix.toarray(),
    columns=Tfid_Data.get_feature_names_out(),
    index=dataset.index,
)
X_Tfid

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai cabang bank,abai hubung,abai hubung cs,...,zaman,zaman gw,zaman gw kuliah,zaman pake,zaman pake kartu,zharif,zte,zte bca,zte bca mobile,zuu
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2714,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2715,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2716,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Attach the complaint label to the TF-IDF features (joined on the row index).
Tfid_df = pd.concat(
    [X_Tfid, dataset["complaint"]],
    axis="columns",
)
Tfid_df.head(5)

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai cabang bank,abai hubung,abai hubung cs,...,zaman gw,zaman gw kuliah,zaman pake,zaman pake kartu,zharif,zte,zte bca,zte bca mobile,zuu,complaint
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [None]:
# Target is the complaint label; all TF-IDF columns are predictors.
y = Tfid_df['complaint']
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Reserve a quarter of the rows for testing; seed fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Baseline for the 1-3 gram TF-IDF features: no resampling of the training split.
# classifier_report is defined outside this cell; its return value is kept in
# `model` and shown as the cell output.
model = classifier_report(
    X_train, X_test,
    y_train, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002342 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2211
[LightGBM] [Info] Number of data points in the train set: 2038, number of used features: 131
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.457311 -> initscore=-0.171172
[LightGBM] [Info] Start training from score -0.171172
Learning rate set to 0.013963
0:	learn: 0.6888937	total: 81ms	remaining: 1m 20s
1:	learn: 0.6843124	total: 144ms	remaining: 1m 12s
2:	learn: 0.6809473	total: 210ms	remaining: 1m 9s
3:	learn: 0.6771184	total: 279ms	remaining: 1m 9s
4:	learn: 0.6741173	total: 344ms	remaining: 1m 8s
5:	learn: 0.6705991	total: 408ms	remaining: 1m 7s
6:	learn: 0.6674127	total: 475ms	remaining: 1m 7s
7:	learn: 0.6637832	total: 540ms	remaining: 1m 6s
8:	learn: 0.65

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SGD Classifier,0.9,0.88,0.84,0.92,0.89,
Ridge Classifier CV,0.9,0.88,0.82,0.95,0.89,
Calibrated Classifier CV,0.89,0.88,0.84,0.91,0.89,0.96
Extra Trees Classifier,0.89,0.87,0.79,0.96,0.88,0.96
Passive Aggressive Classifier,0.89,0.87,0.81,0.95,0.88,
Ridge Classifier,0.88,0.86,0.78,0.95,0.87,
Random Forest Classifier,0.88,0.85,0.77,0.95,0.87,0.95
Perceptron,0.87,0.84,0.74,0.97,0.86,
XGBoost Classifier,0.86,0.84,0.78,0.9,0.86,0.93
Logistic Regression,0.86,0.83,0.72,0.97,0.85,0.95


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE-balance the training split. Without a seed the synthetic rows differ on
# every run; 33 matches the seed used for train_test_split in this notebook.
over = SMOTE(random_state=33)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

# value_counts' first positional parameter is `normalize`; pass it by name
# instead of relying on the truthiness of the string 'complaint'.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Comparison repeated with the SMOTE-balanced training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_over,
    X_test,
    y_train_over,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.005318 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2716
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 159
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01446
0:	learn: 0.6883097	total: 95.4ms	remaining: 1m 35s
1:	learn: 0.6836865	total: 164ms	remaining: 1m 21s
2:	learn: 0.6787938	total: 234ms	remaining: 1m 17s
3:	learn: 0.6742453	total: 309ms	remaining: 1m 16s
4:	learn: 0.6702095	total: 383ms	remaining: 1m 16s
5:	learn: 0.6672294	total: 479ms	remaining: 1m 19s
6:	learn: 0.6627636	total: 553ms	remaining: 1m 18s
7:	learn: 0.6593239	total: 627ms	remaining: 1m 17s
8:	learn: 0.6557198	total: 713ms	remaining: 1m 18s
9:	learn: 

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.81,0.96,0.89,0.96
SGD Classifier,0.9,0.88,0.83,0.93,0.89,
Ridge Classifier CV,0.9,0.88,0.83,0.95,0.89,
Calibrated Classifier CV,0.89,0.87,0.83,0.92,0.89,0.96
Ridge Classifier,0.89,0.87,0.8,0.94,0.88,
Random Forest Classifier,0.89,0.86,0.79,0.95,0.88,0.95
Bernoulli Naive Bayes,0.88,0.86,0.77,0.96,0.87,0.94
Passive Aggressive Classifier,0.88,0.86,0.81,0.92,0.88,
Perceptron,0.88,0.84,0.74,0.98,0.86,
Logistic Regression,0.87,0.84,0.77,0.94,0.86,0.95


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly under-sample the majority class of the training split. Without a
# seed a different subset is kept on every run; 33 matches the seed used for
# train_test_split in this notebook.
under = RandomUnderSampler(random_state=33)
X_train_under, y_train_under = under.fit_resample(X_train, y_train)

# value_counts' first positional parameter is `normalize`; pass it by name
# instead of relying on the truthiness of the string 'complaint'.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1864

In [None]:
# Comparison repeated with the under-sampled training split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_under, X_test,
    y_train_under, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001920 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1990
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 119
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6885651	total: 71.5ms	remaining: 1m 11s
1:	learn: 0.6833495	total: 129ms	remaining: 1m 4s
2:	learn: 0.6790726	total: 188ms	remaining: 1m 2s
3:	learn: 0.6752338	total: 244ms	remaining: 1m
4:	learn: 0.6706680	total: 302ms	remaining: 1m
5:	learn: 0.6671816	total: 361ms	remaining: 59.7s
6:	learn: 0.6636013	total: 423ms	remaining: 60s
7:	learn: 0.6590778	total: 479ms	remaining: 59.4s
8:	learn: 0.6551900	total: 539ms	remaining: 59.4s
9:	learn: 0.6517342	total: 5

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Extra Trees Classifier,0.9,0.88,0.82,0.94,0.89,0.95
Ridge Classifier CV,0.9,0.88,0.83,0.93,0.89,
SGD Classifier,0.89,0.88,0.86,0.9,0.89,
Calibrated Classifier CV,0.89,0.87,0.85,0.89,0.88,0.96
Bernoulli Naive Bayes,0.89,0.87,0.82,0.94,0.89,0.94
Passive Aggressive Classifier,0.89,0.87,0.83,0.92,0.89,
Ridge Classifier,0.89,0.87,0.82,0.94,0.89,
Random Forest Classifier,0.88,0.85,0.78,0.94,0.87,0.96
Perceptron,0.88,0.85,0.75,0.97,0.87,
CatBoost Classifier,0.87,0.85,0.8,0.9,0.86,0.94


## Case 6 (text length and n-gram range 1–3)

In [None]:
# Show the frame used in this case: it supplies the `after` text, the
# `complaint` label and the `text_len` column added as an extra feature below.
data_len

Unnamed: 0,after,complaint,text_len
0,mbanking lot transaksi lelet banget lokasi nyala,1.0,7
1,kartu kadaluwarsa urus semua cabang bca,0.0,6
2,bca mobile merah y,1.0,4
3,bca mobile merah y,1.0,4
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0,12
...,...,...,...
2000,kak maaf siang temen tf bca bca temen masuk m...,1.0,17
2001,kpan bner nya mbangking tarik tunai kgk,1.0,7
2002,haduhh jam jam sibuk eror mbakingnya,1.0,6
2003,kakk tanyaa udah bca mobile kartu yg ngajuin...,1.0,14


In [None]:
# Bag-of-words counts over unigrams, bigrams and trigrams of data_len's `after`
# column.
# NOTE(review): the vocabulary is learned from every row of data_len, i.e.
# before the train/test split further down, so held-out comments shape the
# feature space. Consider fitting on the training rows only.
Count_Data = CountVectorizer(ngram_range=(1,3))
X_Count_matrix = Count_Data.fit_transform(data_len.after)

# Carry data_len's index over to the feature frame. text_len and complaint are
# attached afterwards with an index-aligned concat, and a fresh RangeIndex
# would silently misalign rows / introduce NaNs whenever data_len's index is
# not a clean 0..n-1 range.
X_Count = pd.DataFrame(
    data=X_Count_matrix.toarray(),
    columns=Count_Data.get_feature_names_out(),
    index=data_len.index,
)

In [None]:
# Put the count features, the text_len column and the complaint label side by
# side in one frame (joined on the row index).
Count_df = pd.concat(
    [X_Count, data_len["text_len"], data_len["complaint"]],
    axis="columns",
)
Count_df.head(5)

Unnamed: 0,aaa,aaa shenina,abah,abah anis,abai,abai terima,abai terima kasih,abis,abis bca,abis bca baik,...,yuuu bawa,yuuu bawa kim,yv,yv simpan,zte,zte bca,zte bca mobile,zuu,text_len,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,1.0


In [None]:
# Target is the complaint label; every remaining column (n-gram counts plus
# text_len) is a predictor.
y = Count_df['complaint']
X = Count_df.drop(columns=["complaint"])

In [None]:
# 75/25 train/test partition; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Baseline for counts + text_len: no resampling of the training split.
# classifier_report is defined outside this cell; its return value is kept in
# `model` and shown as the cell output.
model = classifier_report(
    X_train,
    X_test,
    y_train,
    y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 582, number of negative: 921
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 299
[LightGBM] [Info] Number of data points in the train set: 1503, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.387226 -> initscore=-0.458990
[LightGBM] [Info] Start training from score -0.458990
Learning rate set to 0.01226
0:	learn: 0.6904498	total: 58.6ms	remaining: 58.5s
1:	learn: 0.6881316	total: 107ms	remaining: 53.2s
2:	learn: 0.6850088	total: 151ms	remaining: 50.2s
3:	learn: 0.6822551	total: 196ms	remaining: 48.8s
4:	learn: 0.6787220	total: 240ms	remaining: 47.7s
5:	learn: 0.6764212	total: 297ms	remaining: 49.2s
6:	learn: 0.6740815	total: 342ms	remaining: 48.6s
7:	learn: 0.6717719	total: 390ms	remaining: 48.4s
8:	learn: 0.669818

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SGD Classifier,0.86,0.84,0.83,0.85,0.86,
Perceptron,0.83,0.78,0.67,0.93,0.81,
XGBoost Classifier,0.82,0.77,0.68,0.87,0.8,0.9
Ridge Classifier,0.81,0.76,0.68,0.85,0.79,
Extra Trees Classifier,0.81,0.75,0.64,0.9,0.79,0.92
Bagging Classifier,0.81,0.75,0.64,0.89,0.79,0.88
Calibrated Classifier CV,0.8,0.76,0.69,0.84,0.79,0.9
Decision Tree,0.8,0.76,0.68,0.85,0.79,0.79
CatBoost Classifier,0.8,0.74,0.61,0.92,0.78,0.9
AdaBoost Classifier,0.8,0.73,0.62,0.9,0.78,0.87


In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE-balance the training split. The sampler is seeded (33, matching the
# train_test_split seed) so the synthetic rows are the same on every run.
over = SMOTE(random_state=33)
X_train_over, y_train_over = over.fit_resample(X_train, y_train)

# Show the class proportions after resampling. `normalize=True` replaces the
# positional 'complaint' string, which value_counts was treating as a truthy
# `normalize` flag.
y_train_over.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


In [None]:
# Counts + text_len features, trained on the SMOTE-balanced split.
# `model` is the value returned by classifier_report (defined elsewhere).
model = classifier_report(
    X_train_over, X_test,
    y_train_over, y_test,
    print=False,
)
model

[LightGBM] [Info] Number of positive: 921, number of negative: 921
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002928 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 340
[LightGBM] [Info] Number of data points in the train set: 1842, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.013373
0:	learn: 0.6914645	total: 52.7ms	remaining: 52.6s
1:	learn: 0.6900776	total: 115ms	remaining: 57.6s
2:	learn: 0.6884085	total: 160ms	remaining: 53.1s
3:	learn: 0.6856569	total: 205ms	remaining: 51s
4:	learn: 0.6832950	total: 251ms	remaining: 49.9s
5:	learn: 0.6817140	total: 295ms	remaining: 48.9s
6:	learn: 0.6806918	total: 343ms	remaining: 48.6s
7:	learn: 0.6789616	total: 393ms	remaining: 48.8s
8:	learn: 0.6775930	total: 442ms	remaining: 48.7s
9:	learn: 0.6755804	tot

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge Classifier CV,0.84,0.83,0.84,0.82,0.84,
Logistic Regression,0.83,0.82,0.88,0.77,0.84,0.89
SGD Classifier,0.81,0.81,0.91,0.73,0.82,
Ridge Classifier,0.81,0.8,0.86,0.75,0.82,
CatBoost Classifier,0.81,0.79,0.81,0.77,0.81,0.89
Perceptron,0.8,0.8,0.87,0.73,0.81,
Gradient Boosting Classifier,0.8,0.79,0.85,0.74,0.81,0.88
XGBoost Classifier,0.8,0.78,0.79,0.76,0.8,0.89
Bernoulli Naive Bayes,0.79,0.76,0.77,0.75,0.79,0.88
LightGBM Classifier,0.78,0.76,0.78,0.73,0.78,0.86


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split until both classes
# are the same size. The fixed seed keeps the retained subset repeatable.
under = RandomUnderSampler(random_state=33)
X_train_under , y_train_under= under.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1164

In [None]:
# Compare classifiers trained on the under-sampled training data against the untouched test split.
# NOTE(review): classifier_report is defined outside this excerpt; judging by the output
# below it fits a suite of classifiers and returns a table of test metrics.
# `model` therefore holds that results table here, not a fitted estimator.
model = classifier_report(X_train_under , X_test , y_train_under , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000996 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 236
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.010992
0:	learn: 0.6918926	total: 76.4ms	remaining: 1m 16s
1:	learn: 0.6905226	total: 126ms	remaining: 1m 2s
2:	learn: 0.6887324	total: 165ms	remaining: 54.9s
3:	learn: 0.6870310	total: 203ms	remaining: 50.5s
4:	learn: 0.6848777	total: 243ms	remaining: 48.4s
5:	learn: 0.6831946	total: 283ms	remaining: 46.9s
6:	learn: 0.6813354	total: 323ms	remaining: 45.8s
7:	learn: 0.6797826	total: 360ms	remaining: 44.6s
8:	learn: 0.6787278	total: 397ms	remaining: 43.7s
9:	learn: 0.6768591	t

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Perceptron,0.83,0.82,0.86,0.78,0.83,
XGBoost Classifier,0.83,0.81,0.8,0.82,0.83,0.9
Logistic Regression,0.82,0.79,0.79,0.8,0.81,0.9
CatBoost Classifier,0.82,0.79,0.77,0.81,0.81,0.89
Extra Trees Classifier,0.82,0.78,0.73,0.84,0.81,0.9
AdaBoost Classifier,0.81,0.76,0.68,0.86,0.8,0.87
SGD Classifier,0.81,0.76,0.65,0.9,0.8,
Calibrated Classifier CV,0.8,0.78,0.78,0.77,0.8,0.89
Passive Aggressive Classifier,0.8,0.78,0.78,0.78,0.8,
Ridge Classifier,0.8,0.77,0.77,0.78,0.8,


In [None]:
# TF-IDF features over uni-, bi- and tri-grams of the cleaned comment text.
Tfid_Data = TfidfVectorizer(ngram_range=(1, 3))
X_Tfid_Matrix = Tfid_Data.fit_transform(data_len.after)

# Dense frame with one column per n-gram in the fitted vocabulary.
tfidf_columns = Tfid_Data.get_feature_names_out()
X_Tfid = pd.DataFrame(X_Tfid_Matrix.toarray(), columns=tfidf_columns)

In [None]:
# Append the comment-length feature and the label to the TF-IDF columns.
tfidf_parts = [X_Tfid, data_len["text_len"], data_len["complaint"]]
Tfid_df = pd.concat(tfidf_parts, axis=1)
Tfid_df.head()

Unnamed: 0,aaa,aaa shenina,abah,abah anis,abai,abai terima,abai terima kasih,abis,abis bca,abis bca baik,...,yuuu bawa,yuuu bawa kim,yv,yv simpan,zte,zte bca,zte bca mobile,zuu,text_len,complaint
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12,1.0


In [None]:
# Separate the binary complaint label from the feature columns.
y = Tfid_df["complaint"]
X = Tfid_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
# Compare classifiers on the TF-IDF + text_len features without any resampling.
# NOTE(review): classifier_report is defined outside this excerpt; judging by the output
# below it fits a suite of classifiers and returns a table of test metrics.
# `model` therefore holds that results table here, not a fitted estimator.
model = classifier_report(X_train , X_test , y_train , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 582, number of negative: 921
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1681
[LightGBM] [Info] Number of data points in the train set: 1503, number of used features: 99
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.387226 -> initscore=-0.458990
[LightGBM] [Info] Start training from score -0.458990
Learning rate set to 0.01226
0:	learn: 0.6902873	total: 59.9ms	remaining: 59.8s
1:	learn: 0.6877311	total: 124ms	remaining: 1m 2s
2:	learn: 0.6846754	total: 188ms	remaining: 1m 2s
3:	learn: 0.6820880	total: 249ms	remaining: 1m 1s
4:	learn: 0.6799997	total: 311ms	remaining: 1m 1s
5:	learn: 0.6763442	total: 381ms	remaining: 1m 3s
6:	learn: 0.6736197	total: 452ms	remaining: 1m 4s
7:	learn: 0.6705722	total: 515ms	remaining: 1m 3s
8:	learn: 0.66869

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Perceptron,0.82,0.79,0.75,0.83,0.81,
Ridge Classifier CV,0.82,0.76,0.66,0.9,0.8,
Calibrated Classifier CV,0.81,0.77,0.71,0.84,0.8,0.91
Passive Aggressive Classifier,0.8,0.74,0.63,0.89,0.78,
Bagging Classifier,0.79,0.73,0.62,0.87,0.78,0.87
CatBoost Classifier,0.79,0.72,0.59,0.9,0.77,0.89
Ridge Classifier,0.79,0.72,0.59,0.9,0.77,
Random Forest Classifier,0.79,0.69,0.55,0.95,0.76,0.9
XGBoost Classifier,0.78,0.72,0.64,0.84,0.77,0.88
AdaBoost Classifier,0.78,0.72,0.63,0.84,0.77,0.85


In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples (and every score derived from them) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
y_train_over.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


In [None]:
# Compare classifiers trained on the SMOTE-balanced TF-IDF features against the untouched test split.
# NOTE(review): classifier_report is defined outside this excerpt; judging by the output
# below it fits a suite of classifiers and returns a table of test metrics.
# `model` therefore holds that results table here, not a fitted estimator.
model = classifier_report(X_train_over , X_test , y_train_over , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 921, number of negative: 921
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002634 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3091
[LightGBM] [Info] Number of data points in the train set: 1842, number of used features: 184
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.013373
0:	learn: 0.6904919	total: 98.3ms	remaining: 1m 38s
1:	learn: 0.6864854	total: 166ms	remaining: 1m 22s
2:	learn: 0.6829359	total: 232ms	remaining: 1m 17s
3:	learn: 0.6799379	total: 308ms	remaining: 1m 16s
4:	learn: 0.6777711	total: 372ms	remaining: 1m 14s
5:	learn: 0.6744244	total: 441ms	remaining: 1m 13s
6:	learn: 0.6711035	total: 513ms	remaining: 1m 12s
7:	learn: 0.6683256	total: 576ms	remaining: 1m 11s
8:	learn: 0.6651307	total: 639ms	remaining: 1m 10s
9:	learn: 0.6628658	total: 702ms	remaining: 1m 9s
10:	learn: 0.6605187	tota

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
SGD Classifier,0.83,0.79,0.73,0.87,0.82,
Ridge Classifier,0.82,0.77,0.68,0.89,0.81,
CatBoost Classifier,0.81,0.76,0.67,0.88,0.8,0.9
AdaBoost Classifier,0.81,0.76,0.68,0.85,0.8,0.85
Ridge Classifier CV,0.81,0.76,0.66,0.89,0.8,
XGBoost Classifier,0.8,0.76,0.68,0.85,0.79,0.89
Calibrated Classifier CV,0.8,0.73,0.61,0.9,0.78,0.92
Bagging Classifier,0.79,0.72,0.62,0.87,0.77,0.86
Extra Trees Classifier,0.79,0.71,0.58,0.91,0.77,0.91
Random Forest Classifier,0.79,0.7,0.56,0.93,0.76,0.9


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Randomly drop majority-class rows from the training split until both classes
# are the same size. The fixed seed keeps the retained subset repeatable.
under = RandomUnderSampler(random_state=33)
X_train_under , y_train_under= under.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
display(y_train_under.value_counts(normalize=True))
len(y_train_under)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
0.0,0.5
1.0,0.5


1164

In [None]:
# Compare classifiers trained on the under-sampled TF-IDF features against the untouched test split.
# NOTE(review): classifier_report is defined outside this excerpt; judging by the output
# below it fits a suite of classifiers and returns a table of test metrics.
# NOTE(review): the stored LightGBM log reports 932 samples per class, which does not match
# the 1164-row under-sampled set shown above — the saved output looks stale; re-run before
# relying on these numbers.
model = classifier_report(X_train_under , X_test , y_train_under , y_test, print=False )
model

[LightGBM] [Info] Number of positive: 932, number of negative: 932
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002119 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1971
[LightGBM] [Info] Number of data points in the train set: 1864, number of used features: 118
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Learning rate set to 0.01344
0:	learn: 0.6886884	total: 45ms	remaining: 45s
1:	learn: 0.6827298	total: 84.2ms	remaining: 42s
2:	learn: 0.6774410	total: 125ms	remaining: 41.7s
3:	learn: 0.6736723	total: 165ms	remaining: 41s
4:	learn: 0.6692447	total: 202ms	remaining: 40.2s
5:	learn: 0.6657129	total: 238ms	remaining: 39.5s
6:	learn: 0.6613839	total: 276ms	remaining: 39.1s
7:	learn: 0.6584555	total: 313ms	remaining: 38.8s
8:	learn: 0.6539065	total: 356ms	remaining: 39.2s
9:	learn: 0.6485804	total: 3

Unnamed: 0_level_0,Accuracy,F1-Score,Recall,Precision,Balance,ROC-AUC
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Ridge Classifier CV,0.9,0.89,0.87,0.91,0.9,
Calibrated Classifier CV,0.89,0.88,0.87,0.89,0.89,0.96
Bernoulli Naive Bayes,0.89,0.88,0.85,0.91,0.89,0.94
Ridge Classifier,0.89,0.88,0.86,0.9,0.89,
Extra Trees Classifier,0.89,0.87,0.82,0.94,0.89,0.96
Random Forest Classifier,0.89,0.86,0.81,0.93,0.88,0.96
Perceptron,0.88,0.87,0.89,0.85,0.88,
Logistic Regression,0.87,0.85,0.82,0.88,0.87,0.95
CatBoost Classifier,0.87,0.85,0.82,0.88,0.86,0.94
Bagging Classifier,0.87,0.85,0.81,0.89,0.86,0.93



## Kesimpulan:

Setelah mencoba berbagai model, Logistic Regression dan Ridge Classifier menunjukkan performa terbaik. Berikut kesimpulan untuk kedua model tersebut:

1.  Logistic Regression:


*   Model ini memberikan akurasi yang tinggi dengan F1 score dan recall yang kuat. Logistic Regression sangat baik dalam mendeteksi positive class (recall) dan memiliki keseimbangan yang baik antara precision dan recall.
*   Akurasi: 0.89, F1 Score: 0.88, Recall: 0.89, Precision: 0.89.



2.   Ridge Classifier


*   Ridge Classifier juga menunjukkan hasil yang sangat mirip dengan Logistic Regression, terutama dalam hal akurasi dan F1 score. Namun, recall-nya sedikit lebih rendah dibandingkan Logistic Regression, yang berarti model ini sedikit kurang sensitif dalam mendeteksi positive class.
*   Akurasi: 0.89, F1 Score: 0.88, Recall: 0.87.




Kesimpulan:
Kedua model ini unggul dibandingkan model lainnya dalam hal akurasi dan keseimbangan antara precision dan recall.
Jika dihadapkan pada pilihan, Logistic Regression mungkin sedikit lebih unggul karena memiliki recall yang lebih tinggi, menunjukkan performa yang lebih baik dalam mendeteksi positive cases.

# Logistic Regression

Dengan data case 4 (len + count + over), yaitu fitur panjang teks + CountVectorizer + oversampling SMOTE.

In [None]:
# Load the preprocessed comments: `after` is the cleaned text, `complaint` the binary label.
data_len = pd.read_csv("modelling_data_final.csv")
data_len.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Number of whitespace-separated tokens in each cleaned comment.
data_len["text_len"] = data_len["after"].map(lambda comment: len(comment.split()))
data_len.head()

Unnamed: 0,after,complaint,text_len
0,mbanking lot transaksi lelet banget lokasi nyala,1.0,7
1,kartu kadaluwarsa urus semua cabang bca,0.0,6
2,bca mobile merah y,1.0,4
3,bca mobile merah y,1.0,4
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0,12


In [None]:
# Bag-of-words counts over uni- and bi-grams of the cleaned comment text.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(data_len.after)

# Dense frame with one column per n-gram in the fitted vocabulary.
count_columns = Count_Data.get_feature_names_out()
X_Count = pd.DataFrame(X_Count_matrix.toarray(), columns=count_columns)
X_Count

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yummy drool,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Append the comment-length feature and the label to the count columns.
count_parts = [X_Count, data_len["text_len"], data_len["complaint"]]
Count_df = pd.concat(count_parts, axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,text_len,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,1.0


In [None]:
# Separate the binary complaint label from the feature columns.
y = Count_df["complaint"]
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples (and every score derived from them) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Baseline: Logistic Regression with default settings on the SMOTE-balanced training data.
model = LogisticRegression().fit(X_train_over, y_train_over)

# Score on the untouched test split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90       375
         1.0       0.87      0.89      0.88       305

    accuracy                           0.89       680
   macro avg       0.89      0.89      0.89       680
weighted avg       0.89      0.89      0.89       680



In [None]:
def con_mat(y_train , y_pred) :
  """Plot an annotated confusion-matrix heatmap for binary predictions.

  Args:
    y_train: Ground-truth labels for the rows that were predicted. The name
      is kept for compatibility; every call in this notebook passes the
      test labels.
    y_pred: Predicted labels, aligned with ``y_train``.

  Returns:
    None. The figure is rendered with ``fig.show()``.
  """
  # Use the labels that were passed in. The previous version ignored this
  # argument and always read the global ``y_test``.
  cm = confusion_matrix(y_train, y_pred)

  # Rows of `cm` are actual classes and columns are predicted classes,
  # hence the x/y tick labels below.
  fig = ff.create_annotated_heatmap(
      z=cm,
      x=['Predicted 0', 'Predicted 1'],
      y=['Actual 0', 'Actual 1'],
      colorscale='Blues',
      showscale=True
  )

  fig.update_layout(
      title='Confusion Matrix',
      xaxis_title='Predicted Labels',
      yaxis_title='Actual Labels'
  )

  fig.show()

con_mat(y_test , y_pred)



In [None]:
from sklearn.model_selection import GridSearchCV, KFold


# Search space for the Logistic Regression hyper-parameters.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    penalty=['l1', 'l2'],
    solver=['liblinear', 'saga'],
    max_iter=[100, 200, 300],
    class_weight=['balanced', None],
)

# 5-fold shuffled cross-validation; the seed fixes the fold assignment.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# `model` is the baseline estimator fitted in the earlier cell.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='accuracy',
    cv=kf,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train_over, y_train_over)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


In [None]:
# Hyper-parameters selected for Logistic Regression (hard-coded rather than read from grid_search).
Best_Parameters = {'C': 10, 'class_weight': 'balanced', 'max_iter': 300, 'penalty': 'l2', 'solver': 'saga'}

# Bind the tuned model to its own name. Rebinding `LogisticRegression` to an
# instance shadowed the class, so later cells that construct
# LogisticRegression(...) failed with "object is not callable" on a fresh run.
tuned_logreg = LogisticRegression(**Best_Parameters)
tuned_logreg.fit(X_train_over, y_train_over)

# Score on the untouched test split.
y_pred = tuned_logreg.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

         0.0       0.91      0.89      0.90       375
         1.0       0.87      0.89      0.88       305

    accuracy                           0.89       680
   macro avg       0.89      0.89      0.89       680
weighted avg       0.89      0.89      0.89       680



In [None]:
# Confusion matrix for the tuned Logistic Regression predictions computed in the cell above.
con_mat(y_test , y_pred)

# Ridge Classifier

In [None]:
# Reload the preprocessed comments for the Ridge experiment (no text_len feature is added in this section).
data = pd.read_csv("modelling_data_final.csv")
data.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Bag-of-words counts over uni- and bi-grams of the cleaned comment text.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(data.after)

# Dense frame with one column per n-gram in the fitted vocabulary.
count_columns = Count_Data.get_feature_names_out()
X_Count = pd.DataFrame(X_Count_matrix.toarray(), columns=count_columns)

In [None]:
# Append the label column to the count features.
count_parts = [X_Count, data["complaint"]]
Count_df = pd.concat(count_parts, axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0


In [None]:
# Separate the binary complaint label from the feature columns.
y = Count_df["complaint"]
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples (and every score derived from them) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
y_train_over.value_counts(normalize=True)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


In [None]:
# Baseline: Ridge Classifier with default settings on the SMOTE-balanced training data.
model = RidgeClassifier().fit(X_train_over, y_train_over)

# Score on the untouched test split.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.88      0.89       375
         1.0       0.86      0.87      0.86       305

    accuracy                           0.88       680
   macro avg       0.87      0.88      0.88       680
weighted avg       0.88      0.88      0.88       680



In [None]:
# Confusion matrix for the baseline Ridge Classifier predictions computed in the cell above.
con_mat(y_test , y_pred)

In [None]:
from sklearn.model_selection import GridSearchCV, KFold


# Search space for the Ridge Classifier hyper-parameters.
params = dict(
    alpha=[1.0, 10.0, 100.0],
    solver=['auto', 'svd', 'cholesky', 'saga'],
    class_weight=[None, 'balanced'],
)

# 3-fold shuffled cross-validation; the seed fixes the fold assignment.
kf = KFold(n_splits=3, shuffle=True, random_state=42)

# `model` is the baseline Ridge estimator fitted in the earlier cell.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=kf,
)
grid_search.fit(X_train_over, y_train_over)

best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Cross-Validation Score:", best_score)


Best Parameters: {'alpha': 10.0, 'class_weight': 'balanced', 'solver': 'auto'}
Best Cross-Validation Score: 0.8598581372516575


In [None]:
# Best hyper-parameters reported by the Ridge grid search above.
Best_Parameters = {'alpha': 10.0, 'class_weight': 'balanced', 'solver': 'auto'}

# Bind the tuned model to its own name. Rebinding `RidgeClassifier` to an
# instance shadowed the class, so the stacking cell that constructs
# RidgeClassifier() failed with "object is not callable" on a fresh run.
tuned_ridge = RidgeClassifier(**Best_Parameters)
tuned_ridge.fit(X_train_over, y_train_over)

# Score on the untouched test split.
y_pred = tuned_ridge.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

         0.0       0.88      0.93      0.90       375
         1.0       0.90      0.85      0.87       305

    accuracy                           0.89       680
   macro avg       0.89      0.89      0.89       680
weighted avg       0.89      0.89      0.89       680



In [None]:
# Confusion matrix for the tuned Ridge Classifier predictions computed in the cell above.
con_mat(y_test , y_pred)

# ENSEMBLE MODEL

## Voting

In [None]:
from sklearn.ensemble import BaggingClassifier , StackingClassifier
from sklearn.ensemble import VotingClassifier


In [None]:
# Reload the preprocessed comments for the ensemble experiments: `after` is the cleaned text, `complaint` the binary label.
data_len = pd.read_csv("modelling_data_final.csv")
data_len.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Number of whitespace-separated tokens in each cleaned comment.
data_len["text_len"] = data_len["after"].map(lambda comment: len(comment.split()))
data_len.head()

Unnamed: 0,after,complaint,text_len
0,mbanking lot transaksi lelet banget lokasi nyala,1.0,7
1,kartu kadaluwarsa urus semua cabang bca,0.0,6
2,bca mobile merah y,1.0,4
3,bca mobile merah y,1.0,4
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0,12


In [None]:
# Bag-of-words counts over uni- and bi-grams of the cleaned comment text.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(data_len.after)

# Dense frame with one column per n-gram in the fitted vocabulary.
count_columns = Count_Data.get_feature_names_out()
X_Count = pd.DataFrame(X_Count_matrix.toarray(), columns=count_columns)
X_Count

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yummy drool,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Append the comment-length feature and the label to the count columns.
count_parts = [X_Count, data_len["text_len"], data_len["complaint"]]
Count_df = pd.concat(count_parts, axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,text_len,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,1.0


In [None]:
# Separate the binary complaint label from the feature columns.
y = Count_df["complaint"]
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples (and every score derived from them) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Base learners for the voting ensemble, as (name, estimator) pairs.
models_vote = [
    ('CC', CalibratedClassifierCV()),
    ('LR', LogisticRegression(random_state=69)),
    ('XT', ExtraTreesClassifier(random_state=60)),
]

# Soft voting on the SMOTE-balanced training data.
votting = VotingClassifier(voting='soft', estimators=models_vote)
votting.fit(X_train_over, y_train_over)

In [None]:
# Score the voting ensemble on the untouched test split.
y_pred = votting.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

         0.0       0.90      0.85      0.88       375
         1.0       0.83      0.89      0.86       305

    accuracy                           0.87       680
   macro avg       0.87      0.87      0.87       680
weighted avg       0.87      0.87      0.87       680



In [None]:
# Confusion matrix for the voting-ensemble predictions computed in the cell above.
con_mat(y_test , y_pred)

## Stacking

In [None]:
# Level-0 learners of the stack, as (name, estimator) pairs.
stack0 = [
    ('GBM', GradientBoostingClassifier()),
    ('EG', ExtraTreesClassifier()),
    ('XG', XGBClassifier()),
    ('RC', RidgeClassifier()),
    ('LGB', LGBMClassifier()),
]

# Level-1 (final) estimator that combines the level-0 outputs.
stack1 = LogisticRegression(random_state=40)

model_stack = StackingClassifier(cv=5, final_estimator=stack1, estimators=stack0)

model_stack.fit(X_train_over, y_train_over)

[LightGBM] [Info] Number of positive: 1106, number of negative: 1106
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.004142 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 418
[LightGBM] [Info] Number of data points in the train set: 2212, number of used features: 140
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 884, number of negative: 885
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 311
[LightGBM] [Info] Number of data points in the train set: 1769, number of used features: 100
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499717 -> initscore=-0.001131
[

In [None]:
# Score the stacking ensemble on the untouched test split.
y_pred = model_stack.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.90      0.90       375
         1.0       0.88      0.87      0.87       305

    accuracy                           0.89       680
   macro avg       0.88      0.88      0.88       680
weighted avg       0.89      0.89      0.89       680



In [None]:
# Confusion matrix for the stacking-ensemble predictions computed in the cell above.
con_mat(y_test , y_pred)

# KESIMPULAN

Logistic Regression setelah hyperparameter tuning tetap memiliki accuracy score tertinggi dengan recall dan precision yang seimbang, dibandingkan dengan ensemble model dan Ridge Classifier.

# FINAL MODEL

In [None]:
# Reload the preprocessed comments for the final model: `after` is the cleaned text, `complaint` the binary label.
data_len = pd.read_csv("modelling_data_final.csv")
data_len.head()

Unnamed: 0,after,complaint
0,mbanking lot transaksi lelet banget lokasi nyala,1.0
1,kartu kadaluwarsa urus semua cabang bca,0.0
2,bca mobile merah y,1.0
3,bca mobile merah y,1.0
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0


In [None]:
# Number of whitespace-separated tokens in each cleaned comment.
data_len["text_len"] = data_len["after"].map(lambda comment: len(comment.split()))
data_len.head()

Unnamed: 0,after,complaint,text_len
0,mbanking lot transaksi lelet banget lokasi nyala,1.0,7
1,kartu kadaluwarsa urus semua cabang bca,0.0,6
2,bca mobile merah y,1.0,4
3,bca mobile merah y,1.0,4
4,ngisi pulsa bca mobile pulsa kaga masuk masuk...,1.0,12


In [None]:
# Bag-of-words counts over uni- and bi-grams of the cleaned comment text.
Count_Data = CountVectorizer(ngram_range=(1, 2))
X_Count_matrix = Count_Data.fit_transform(data_len.after)

# Dense frame with one column per n-gram in the fitted vocabulary.
count_columns = Count_Data.get_feature_names_out()
X_Count = pd.DataFrame(X_Count_matrix.toarray(), columns=count_columns)
X_Count

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yummy drool,yuuu,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2713,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2714,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2715,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2716,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Append the comment-length feature and the label to the count columns.
count_parts = [X_Count, data_len["text_len"], data_len["complaint"]]
Count_df = pd.concat(count_parts, axis=1)
Count_df.head()

Unnamed: 0,aaa,aaa shenina,abad,abah,abah anis,abai,abai cabang,abai hubung,abai terima,abis,...,yuuu bawa,zaman,zaman gw,zaman pake,zharif,zte,zte bca,zuu,text_len,complaint
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,7,1.0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,6,0.0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,1.0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,12,1.0


In [None]:
# Separate the binary complaint label from the feature columns.
y = Count_df["complaint"]
X = Count_df.drop(columns=["complaint"])

In [None]:
# Hold out 25% of the rows for testing; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=33,
)

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split only. The fixed seed
# makes the synthetic samples (and the saved final model) repeatable.
over = SMOTE(random_state=33)
X_train_over , y_train_over = over.fit_resample(X_train , y_train)
# Class balance as proportions. The previous positional 'complaint' argument
# was silently interpreted as the `normalize` flag.
display(y_train_over.value_counts(normalize=True))
len(y_train_over)

Unnamed: 0_level_0,proportion
complaint,Unnamed: 1_level_1
1.0,0.5
0.0,0.5


2212

In [None]:
# Hyper-parameters selected for the final Logistic Regression model (hard-coded).
Best_Parameters = {'C': 10, 'class_weight': 'balanced', 'max_iter': 300, 'penalty': 'l2', 'solver': 'saga'}

# NOTE(review): this rebinds the name `LogisticRegression` from the class to a fitted
# instance. The pickling cell that follows reads the instance through this name, so the
# two must be renamed together; re-running this cell on its own fails because the name
# is no longer callable.
LogisticRegression = LogisticRegression(**Best_Parameters)
LogisticRegression.fit(X_train_over, y_train_over)
# Score the final model on the untouched test split.
y_pred = LogisticRegression.predict(X_test)
print(classification_report(y_test , y_pred))

              precision    recall  f1-score   support

         0.0       0.89      0.90      0.90       375
         1.0       0.87      0.87      0.87       305

    accuracy                           0.89       680
   macro avg       0.88      0.88      0.88       680
weighted avg       0.89      0.89      0.89       680



In [None]:
import pickle

# At this point `LogisticRegression` is the fitted instance created in the previous cell,
# not the scikit-learn class.
model = LogisticRegression

# Save the model to a file.
# NOTE(review): only the classifier is written here. Predicting on new text also needs the
# fitted CountVectorizer (`Count_Data`) — confirm it is persisted elsewhere.
with open('LogisticRegression_final_model.pkl', 'wb') as file:
    pickle.dump(model, file)
