In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from tkinter import Tk
from tkinter.filedialog import askopenfilename

def select_file():
    """Show a native "open file" dialog and return the user's choice.

    A Tk root window is created only to host the dialog. It is hidden while
    the dialog is open and destroyed afterwards, so calling this function
    several times does not leave a growing number of invisible root windows
    alive in the kernel.

    Returns:
        The value returned by ``askopenfilename``: the selected path, or an
        empty value if the user cancels the dialog.
    """
    root = Tk()
    root.withdraw()  # keep the empty root window off-screen; only the dialog should appear
    try:
        filename = askopenfilename(parent=root)
    finally:
        # Always release the Tk interpreter/window, even if the dialog raised.
        root.destroy()
    return filename

# Ask for each CSV in turn and load it immediately after it is chosen.
filename1 = select_file()
df1 = pd.read_csv(filename1)
filename2 = select_file()
df2 = pd.read_csv(filename2)

# Preview the first rows of both frames.
for heading, frame in (("Dataframe 1:", df1), ("\nDataframe 2:", df2)):
    print(heading)
    print(frame.head())

# List the column names of both frames ("Kolom" = columns).
for heading, frame in (
    ("\nKolom dalam Dataframe 1:", df1),
    ("\nKolom dalam Dataframe 2:", df2),
):
    print(heading)
    print(frame.columns)

# Names of the columns holding the document text and the target class.
# NOTE(review): the saved output of this cell shows Dataframe 1 with a
# 'category' column and no 'label' column, which is why the recorded run
# ended in the KeyError raised below -- confirm the intended target column
# and adjust LABEL_COLUMN if needed.
TEXT_COLUMN = 'text'
LABEL_COLUMN = 'label'

if TEXT_COLUMN in df1.columns and LABEL_COLUMN in df1.columns:
    # Rows lacking a text or a label cannot be used for training, and a NaN
    # document makes TfidfVectorizer raise during fit, so drop them up front.
    labelled = df1.dropna(subset=[TEXT_COLUMN, LABEL_COLUMN])
    X = labelled[TEXT_COLUMN].astype(str)
    y = labelled[LABEL_COLUMN]
else:
    # Report which columns are actually present before aborting.
    message = f"Kolom '{TEXT_COLUMN}' atau '{LABEL_COLUMN}' tidak ditemukan dalam Dataframe 1"
    print(message)
    print("Kolom yang tersedia: ", df1.columns)
    raise KeyError(message)

# TfidfVectorizer accepts stop_words only as the string 'english', a list, or
# None; scikit-learn's parameter validation rejects a set. Deduplicate and
# convert to a sorted list so the value is valid and deterministic.
# NOTE(review): the sample rows in the saved output look Indonesian, so an
# English stop-word list may remove very little -- confirm whether
# stopwords.words('indonesian') is what is wanted here.
stop_words = sorted(set(stopwords.words('english')))

# Hold out 20% of the rows for the final evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tfidf = TfidfVectorizer(stop_words=stop_words)

# Text -> TF-IDF features -> random forest. The step names ('tfidf', 'clf')
# are the prefixes used to address hyper-parameters in a parameter grid.
pipeline = Pipeline([
    ('tfidf', tfidf),
    # Seed the forest as well, so repeated runs give the same model; without
    # it only the train/test split above was reproducible.
    ('clf', RandomForestClassifier(random_state=42))
])

# Hyper-parameter grid: every combination of the TF-IDF and forest settings
# below is evaluated (2 * 2 * 2 * 3 = 24 candidates).
parameters = dict(
    tfidf__max_df=[0.75, 1.0],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    clf__n_estimators=[100, 200],
    clf__max_depth=[None, 10, 20],
)

# 5-fold cross-validated search over the pipeline; n_jobs=-1 runs the fits
# in parallel and verbose=1 prints a progress summary.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=parameters,
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_}")

# Score the best pipeline found by the grid search on the held-out split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

print("Classification Report:")
report = classification_report(y_test, y_pred)
print(report)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Roma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataframe 1:
   Unnamed: 0                                               text    category  \
0           0  Warnanya tdk sesuai pesanan. Tdk ada chat/pemb...  elektronik   
1           1                 Thanks gan barang bagus.smoga awet   handphone   
2           2   barang sesuai deskripsi.. laptop dapat di-charge  elektronik   
3           3            Goood product..........    ............   handphone   
4           4                            Bagus pengiriman cepat.   handphone   

                                        product_name  product_id  
0  USB HUB 3.0 7 port by DIGIGEAR HIGH SPEED 1.2 ...   170197447  
1  Holder anti hujan &amp; copet untuk smartphone...    37564148  
2  Adaptor Charger Laptop Toshiba Satellite C800 ...   254339251  
3  Silikon Case Blackberry Bold 9000 Hitam Gratis...     4837401  
4  Audio Splitter Jack 3.5mm to dual female U Sha...   270447288  

Dataframe 2:
   Unnamed: 0.2  Unnamed: 0.1  Unnamed: 0  \
0             0             0           1   


KeyError: "Kolom 'text' atau 'label' tidak ditemukan dalam Dataframe 1"