In [18]:
import pandas as pd
import re
import emoji
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

In [19]:
# Download the NLTK resources used later in this notebook:
#   - 'stopwords': the English stop-word list read inside preprocess_text
#   - 'wordnet':   the lexical database backing WordNetLemmatizer
# nltk.download returns a bool; the value of the last call is what the cell displays.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ILYAS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ILYAS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [20]:
# Load the raw dataset from 'Data.csv' (path is relative to the working directory).
# The 'title' and 'body' columns of this frame are cleaned by preprocess_text further down.
df = pd.read_csv('Data.csv')

In [21]:
# Create the shared Porter stemmer and WordNet lemmatizer once;
# preprocess_text reuses these module-level instances for every row.
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

In [22]:
def emoji_to_text(text):
    """Return `text` with its emoji converted to text by `emoji.demojize`.

    Parameters
    ----------
    text : str
        Input string, possibly containing emoji characters.

    Returns
    -------
    str
        The result of `emoji.demojize(text)` using the library's default settings.
    """
    demojized = emoji.demojize(text)
    return demojized

In [23]:
# English stop words, built once when this cell runs rather than once per row.
# Requires the 'stopwords' corpus to have been downloaded already.
_STOP_WORDS = frozenset(stopwords.words('english'))


def preprocess_text(text, use_stemming=True, use_lemmatization=True):
    """Normalize a single piece of review text for bag-of-words modelling.

    Steps, in order:
      1. missing values (None / NaN) become an empty string
      2. emoji are replaced by their textual names
      3. lowercase
      4. URLs are removed
      5. punctuation and digits are removed
      6. tokens of length <= 2 and English stop words are dropped
      7. optional Porter stemming
      8. optional WordNet lemmatization

    Parameters
    ----------
    text : object
        The raw text. Non-string values other than None/NaN are converted
        with `str()` before processing.
    use_stemming : bool, default True
        Apply the module-level `stemmer` to every remaining token.
    use_lemmatization : bool, default True
        Apply the module-level `lemmatizer` to every remaining token
        (after stemming, when both are enabled).

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; '' if nothing remains.
    """
    # A missing cell arrives as None or float NaN. Converting it with str()
    # would inject the literal token "nan" into the corpus, so return an
    # empty document instead. (`text != text` is true only for NaN.)
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Emoji must be converted BEFORE punctuation is stripped: emoji characters
    # are not matched by \w, so stripping first deletes them and leaves nothing
    # to convert. Space delimiters (instead of the default colons) keep each
    # emoji name a separate token even when emoji are adjacent to each other
    # or glued to a word.
    text = emoji.demojize(text, delimiters=(' ', ' '))

    text = text.lower()

    # Remove links (anything starting with http or www up to the next whitespace).
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove punctuation, then digits. Underscores count as \w, so multi-word
    # emoji names such as "thumbs_up" survive as one token.
    # NOTE(review): contractions lose their apostrophe here ("doesn't" -> "doesnt"),
    # so they presumably no longer match the stop-word list's apostrophe forms —
    # confirm whether that is intended.
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)

    # Drop very short tokens and stop words in a single pass.
    tokens = [
        word for word in text.split()
        if len(word) > 2 and word not in _STOP_WORDS
    ]

    if use_stemming:
        tokens = [stemmer.stem(word) for word in tokens]

    if use_lemmatization:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)


In [24]:
# Clean the two free-text columns, replacing each with its preprocessed version.
for column in ('title', 'body'):
    df[column] = df[column].apply(preprocess_text)


In [25]:
# Save the preprocessed data to a new CSV file.
# index=False leaves the DataFrame index out of the file, so no extra
# index column is written alongside the data columns.
df.to_csv('Preprocessed_Data.csv', index=False)

In [26]:
# Display the first few rows of the preprocessed data.
# A bare last expression uses the notebook's rich table rendering, which
# avoids the line-wrapped plain-text output that print() gives for a wide frame.
df.head()

         asin           name  rating               date  verified  \
0  B0000SX2UC          Janet       3   October 11, 2005     False   
1  B0000SX2UC     Luke Wyatt       1    January 7, 2004     False   
2  B0000SX2UC         Brooke       5  December 30, 2003     False   
3  B0000SX2UC  amy m. teague       3     March 18, 2004     False   
4  B0000SX2UC  tristazbimmer       4    August 28, 2005     False   

                                  title  \
0                        def best worst   
1               text messag doesnt work   
2                            love phone   
3                            love phone   
4  great phone servic option lousi case   

                                                body  helpfulVotes  
0  samsung awhil absolut doo doo read review dete...           1.0  
1  due softwar issu nokia sprint phone text messa...          17.0  
2  great reliabl phone also purchas phone samsung...           5.0  
3  love phone realli need one didnt expect price .

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

In [48]:
# Load the data (replace the path with the appropriate one if needed).
# This rebinds `df` to the contents of the preprocessed CSV file.
df = pd.read_csv("Preprocessed_Data.csv")

In [49]:
# Handle missing values: keep only the rows whose 'body' text is present.
has_body = df['body'].notna()
df = df[has_body]

In [50]:
# Split the data into training and test sets (90% training, 10% test).
# The review text is the feature and the star rating is the label;
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['body'],
    df['rating'],
    test_size=0.1,
    random_state=42,
)


In [51]:

# Turn the text into numeric features with TF-IDF.
# 'max_features' caps the vocabulary size and can be tuned as needed.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=1000,
)
# Learn the vocabulary and IDF weights from the training text only,
# then reuse them unchanged for the test text.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [52]:
# Build the K-Nearest Neighbors (K-NN) classifier.
# The TF-IDF features are a sparse matrix. scikit-learn cannot build a ball
# tree on sparse input and overrides the setting with brute-force search, so
# request 'brute' explicitly; the neighbours found are the same, and the
# parameter now states what actually runs.
knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute')


In [53]:
# Train the model on the TF-IDF features of the training set and their rating labels.
knn.fit(X_train_tfidf, y_train)




In [54]:
# Predict the ratings for the test set from its TF-IDF features.
y_pred = knn.predict(X_test_tfidf)


In [55]:

# Evaluate the model: overall accuracy plus per-class precision/recall/F1.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print("Akurasi:", accuracy)
print("Classification Report:\n", report)


Akurasi: 0.5771732702542874
Classification Report:
               precision    recall  f1-score   support

           1       0.50      0.34      0.41      1275
           2       0.15      0.04      0.06       375
           3       0.17      0.05      0.08       457
           4       0.26      0.05      0.09       929
           5       0.62      0.91      0.74      3728

    accuracy                           0.58      6764
   macro avg       0.34      0.28      0.28      6764
weighted avg       0.49      0.58      0.50      6764

