# ***LABELING DATA***

In [None]:
import pandas as pd

# Load hasil_preprocessing.csv (presumably the output of an earlier
# preprocessing step) and print a summary of its columns.
data = pd.read_csv("hasil_preprocessing.csv")
data.info()

In [None]:
# Drop every row that has a missing value in any column, then print the
# summary again for the remaining rows.
df = data.dropna()
df.info()

In [None]:
# Keep only the 'steming_data' column; note that this rebinds `data`, so the
# raw frame loaded above is no longer reachable under that name.
data = pd.DataFrame(df[['steming_data']])
data.head(5)

In [None]:
import pandas as pd

# determine the sentiment label and the sentiment score of one text
def determine_sentiment(text, positive_words=None, negative_words=None):
  """Label a text by counting lexicon hits.

  Args:
    text: string of whitespace-separated tokens.
    positive_words: optional container of positive words. When omitted, the
      notebook-level ``positive_lexicon`` is used.
    negative_words: optional container of negative words. When omitted, the
      notebook-level ``negative_lexicon`` is used.

  Returns:
    A ``(sentiment, score)`` tuple where ``score`` is the number of positive
    tokens minus the number of negative tokens, and ``sentiment`` is
    "Positif" when ``score >= 0`` and "Negatif" otherwise.
  """
  if positive_words is None:
    positive_words = positive_lexicon
  if negative_words is None:
    negative_words = negative_lexicon

  tokens = text.split()  # tokenize once instead of once per lexicon
  positive_count = sum(1 for word in tokens if word in positive_words)
  negative_count = sum(1 for word in tokens if word in negative_words)
  score = positive_count - negative_count  # compute the sentiment score

  # NOTE(review): a tie (score == 0) is labeled "Positif". The previous
  # `else: sentiment = None` branch could never run because `>= 0` and `< 0`
  # already cover every integer, so it was removed. If neutral texts are meant
  # to yield None (to be filled in by replace_none_sentiment), change `>=` to
  # `>` and return None for score == 0 -- confirm the intended labeling rule.
  sentiment = "Positif" if score >= 0 else "Negatif"
  return sentiment, score

# read the positive and negative lexicons: the first column of each
# header-less, tab-separated file is turned into a set
positive_lexicon = set(pd.read_csv("Positive.tsv", sep="\t", header=None)[0])
negative_lexicon = set(pd.read_csv("Negative.tsv", sep="\t", header=None)[0])

# fill in None entries of a sentiment list with alternating labels
def replace_none_sentiment(sentiments):
  """Replace each None in `sentiments` with "Positif"/"Negatif" in turn.

  The first None becomes "Positif", the next "Negatif", and so on. The list
  is modified in place and the same list object is returned.
  """
  next_label = "Positif"
  for position, label in enumerate(sentiments):
    if label is not None:
      continue
    sentiments[position] = next_label
    # flip the label that the next None entry will receive
    next_label = "Positif" if next_label == "Negatif" else "Negatif"
  return sentiments

# Score every document exactly once. Collecting plain (sentiment, score)
# tuples avoids building one pandas Series per row, which is what
# `.apply(lambda x: pd.Series(...))` did.
labeled = [determine_sentiment(text) for text in data['steming_data']]

# Sentiment column: any None label is replaced by alternating labels.
data['Sentiment'] = replace_none_sentiment([sentiment for sentiment, _ in labeled])
data['Score'] = [score for _, score in labeled]

df = pd.DataFrame(data[['steming_data', 'Score', 'Sentiment']])

# show the result (only the last expression of a cell is displayed, so the
# former mid-cell `df.head(5)` had no effect and was removed)
df.head()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of documents per sentiment class, and the overall total used for
# the percentage labels.
sentiment_count = df['Sentiment'].value_counts()
total = len(df['Sentiment'])

sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(6,4))
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
ax.set_title('Labeling Data', fontsize=14, pad=20)
ax.set_xlabel('Class Sentiment', fontsize=12)
ax.set_ylabel('Jumlah Data', fontsize=12)

# Annotate each bar with its count and its share of all documents.
for i, count in enumerate(sentiment_count.values):
  ax.text(i, count + 0.10, f'{count}\n({100 * count / total:.2f}%)', ha='center', va='bottom')

plt.show()

In [None]:
# Save the labeled frame to hasil_labeling.csv without the index column.
df.to_csv('hasil_labeling.csv', encoding='utf8', index=False)

In [None]:
# export one CSV file per sentiment label, named "<label>_dataset.csv" and
# containing only the rows that carry that label
for sentiment_label in ('Positif', 'Negatif'):
  rows_with_label = df.loc[df['Sentiment'] == sentiment_label]
  rows_with_label.to_csv(f"{sentiment_label}_dataset.csv", index=False)

**WORD CLOUD**

In [None]:
import pandas as pd

# Reload the labeled data from disk; this rebinds `data` to the CSV contents.
data = pd.read_csv("hasil_labeling.csv")
data.head(5)

In [None]:
import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# split the dataset by sentiment and join each class's stemmed texts into a
# single space-separated string
sentimen_Negative = data.loc[data['Sentiment'] == 'Negatif', 'steming_data'].str.cat(sep=' ')
sentimen_Positive = data.loc[data['Sentiment'] == 'Positif', 'steming_data'].str.cat(sep=' ')

In [None]:
# build and display a WordCloud
def create_wordcloud(text, title):
  """Generate a word cloud from `text` and show it with `title` as heading.

  Args:
    text: the string passed to ``WordCloud.generate``.
    title: the title drawn above the image.
  """
  cloud_builder = WordCloud(
      width=800,
      height=400,
      random_state=42,
      max_font_size=100,
      background_color='white',
  )
  cloud_image = cloud_builder.generate(text)

  fig, ax = plt.subplots(figsize=(10, 5))
  ax.imshow(cloud_image, interpolation='bilinear')
  ax.axis('off')  # hide ticks and frame; only the image matters
  ax.set_title(title)
  plt.show()

In [None]:
# build the WordCloud for the negative-sentiment texts
create_wordcloud(sentimen_Negative, "Word Cloud Sentimen Negatif")

In [None]:
# build the WordCloud for the positive-sentiment texts
create_wordcloud(sentimen_Positive, "Word Cloud Sentimen Positif")

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import pandas as pd


# Build one corpus string from all stemmed texts. Missing values are dropped
# first: NaN is a float, so stringifying it would add the literal word "nan"
# to the cloud.
text = ' '.join(data['steming_data'].dropna().astype(str))
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

plt.figure(figsize=(10, 5))

# show the word cloud using bilinear image interpolation
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')

plt.show()

# ***SPLITTING DATA***

In [None]:
import pandas as pd

# Reload the labeled data from disk and print a summary of its columns.
data = pd.read_csv("hasil_labeling.csv")
data.info()

In [None]:
# Derive `df` from the frame that was just loaded rather than from whatever
# `df` an earlier cell left behind: this makes the cell idempotent and drops
# rows whose fields came back as NaN from the CSV. The index is reset so it
# is contiguous for later positional operations.
df = data.dropna().reset_index(drop=True)

In [None]:
# Print a summary of `df` after the missing rows were dropped.
df.info()

In [None]:
from sklearn.model_selection import train_test_split

# split the data into training and test sets: 20% of the rows go to the test
# set, and random_state=42 fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(df['steming_data'], df['Sentiment'], test_size=0.2, random_state=42)

In [None]:
# save the training data to a CSV file with columns 'text' and 'sentiment'
train_set = pd.DataFrame({'text': X_train, 'sentiment': y_train})
train_set.to_csv('train_data.csv', index=False)

In [None]:
# save the test data to a CSV file with columns 'text' and 'sentiment'
test_set = pd.DataFrame({'text': X_test, 'sentiment': y_test})
test_set.to_csv('test_data.csv', index=False)

In [None]:
# print how many rows ended up in the training and test sets
print(f'Jumlah Data Latih: {len(X_train)}')
print(f'Jumlah Data Uji: {len(X_test)}')

In [None]:
import matplotlib.pyplot as plt

# number of training rows and test rows
train_size = len(X_train)
test_size = len(X_test)

# draw one bar per subset
fig, ax = plt.subplots(figsize=(6, 4))
bars = ax.bar(['Data Latih', 'Data uji'], [train_size, test_size], color=['#B0CBEF', '#F4B183'])

# label each bar with its row count and, in parentheses, its percentage of
# all rows
for bar in bars:
  height = bar.get_height()
  label_x = bar.get_x() + bar.get_width()/2
  ax.text(label_x, height + 20, f'{height} ({height / (train_size + test_size) * 100:.2f}%)', ha='center', va='bottom')

ax.set_title("Splitting Data")
ax.set_xlabel('Jenis Data')
ax.set_ylabel('Jumlah Data')
plt.show()

# ***NAIVE BAYES CLASSIFICATION***

In [None]:
# Print a summary of the frame used for the manual Naive Bayes computation.
df.info()

In [None]:
# Show the first rows of the frame used for the manual Naive Bayes computation.
df.head()

**Transformasi Teks Menjadi Matriks Frekuensi Token dan Menghitung Probabilitas Prior ($P(C)$)**

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

# initialize CountVectorizer and build the document-term count matrix
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['steming_data'])
features = vectorizer.get_feature_names_out()
# Give the token frame the same index as `df`. With the default RangeIndex,
# the column-wise concat below aligns on index labels and pairs token rows
# with the wrong sentiment whenever `df` has gaps in its index (e.g. after
# dropna).
df_tokens = pd.DataFrame(X.toarray(), columns=features, index=df.index)
df_combined = pd.concat([df_tokens, df['Sentiment']], axis=1)

# token frequency per sentiment class; the labels are capitalized
# ('Positif' / 'Negatif'), so the filters must use the same spelling or they
# select no rows at all
frequency_positive = df_combined[df_combined['Sentiment'] == 'Positif'].drop('Sentiment', axis=1).sum()
frequency_negative = df_combined[df_combined['Sentiment'] == 'Negatif'].drop('Sentiment', axis=1).sum()

# prior probability of each class = class count / number of documents
sentiment_counts = df['Sentiment'].value_counts()
total_samples = len(df)
prior_probabilities = sentiment_counts / total_samples

print("Prior Probabilities:")
print(prior_probabilities)

**Menghitung Probabilitas Kondisional ($P(w_i \mid C)$)**

In [None]:
# conditional probabilities with Laplace (add-one) smoothing:
# (token count in class + 1) / (total tokens in class + vocabulary size)

total_positive = frequency_positive.sum()
total_negative = frequency_negative.sum()
vocabulary_size = len(features)

probability_conditional_positive = (frequency_positive + 1) / (total_positive + vocabulary_size)
probability_conditional_negative = (frequency_negative + 1) / (total_negative + vocabulary_size)

for heading, probabilities in (
    ("Probability Conditional Positive:", probability_conditional_positive),
    ("Probability Conditional Negative:", probability_conditional_negative),
):
  print(heading)
  print(probabilities)

**Menghitung Probabilitas Posterior ($P(C \mid w)$) untuk Setiap Dokumen**

In [None]:
# compute the (unnormalized) posterior score of each class for one document
def calculate_posterior_probabilities(document, priors=None, conditional_positive=None, conditional_negative=None):
  """Multiply the class prior by the conditional probability of each known word.

  Args:
    document: string of whitespace-separated tokens.
    priors: optional mapping with keys 'Positif' and 'Negatif'. Defaults to
      the notebook-level ``prior_probabilities``.
    conditional_positive: optional mapping word -> P(word | Positif).
      Defaults to ``probability_conditional_positive``.
    conditional_negative: optional mapping word -> P(word | Negatif).
      Defaults to ``probability_conditional_negative``.

  Returns:
    ``{'Positif': value, 'Negatif': value}``. The values are plain products
    and are not normalized to sum to 1. Words missing from either mapping
    are skipped.
  """
  if priors is None:
    priors = prior_probabilities
  if conditional_positive is None:
    conditional_positive = probability_conditional_positive
  if conditional_negative is None:
    conditional_negative = probability_conditional_negative

  posterior_positive = priors['Positif']
  posterior_negative = priors['Negatif']

  for word in document.split():
    # `in` on a Series (or dict) is a key lookup; the previous
    # `word in features` scanned the whole vocabulary array for every word.
    if word in conditional_positive and word in conditional_negative:
      posterior_positive *= conditional_positive[word]
      posterior_negative *= conditional_negative[word]
  # NOTE(review): products of many small probabilities can underflow to 0.0
  # for long documents; summing logarithms would avoid that but would change
  # the values stored downstream.
  return {'Positif': posterior_positive, 'Negatif': posterior_negative}

# Store, for every document, the dict returned by
# calculate_posterior_probabilities in a new column, then preview the frame.
df['posterior_probabilities'] = df['steming_data'].apply(calculate_posterior_probabilities)
df.head()

# ***NAIVE BAYES MULTINOMIAL***

In [None]:
# Print a summary of the frame before training the Multinomial model.
df.info()

In [None]:
# Show the first rows of the frame before training the Multinomial model.
df.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing data: TF-IDF features and target labels.
# The feature matrix must be bound to `X` (upper case) because that is the
# name passed to train_test_split below; binding it to `x` made the split
# silently reuse the stale CountVectorizer matrix from an earlier cell.
# NOTE(review): the vectorizer is fitted on all rows before the split, so
# test documents influence the vocabulary and IDF weights -- consider
# fitting on the training rows only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['steming_data']).toarray()
y = df['Sentiment']

# split data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize and train model MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train, y_train)

# predict with MultinomialNB
y_pred_mnb = mnb.predict(X_test)

# evaluate MultinomialNB
conf_matrix_mnb = confusion_matrix(y_test, y_pred_mnb)
class_report_mnb = classification_report(y_test, y_pred_mnb)
accuracy_mnb = accuracy_score(y_test, y_pred_mnb)

print("Results")
print("=====================================")
print("Confusion Matrix :")
print(conf_matrix_mnb)
print("=====================================")
print("\nClassification Report :")
print(class_report_mnb)
print("=====================================")
print(f"Accuracy : {accuracy_mnb:.4f}")
print("=====================================")

# plot confusion matrix for MultinomialNB
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix_mnb, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# create DataFrame for actual vs predicted (MultinomialNB); y_test keeps the
# index labels of `df`, so they are used to look up the original text
results_mnb = pd.DataFrame({'steming_data': df.loc[y_test.index, 'steming_data'], 'Actual': y_test, 'Predicted': y_pred_mnb})
results_mnb.to_csv("hasil_prediksi.csv", encoding='utf8', index=False)
print("Actual vs Predicted :")
results_mnb.head()

**Menampilkan Jumlah Analisis Data Aktual**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per actual class.
sentiment_count = results_mnb['Actual'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Hasil Analisis Data Actual', fontsize=14, pad=20)
plt.xlabel('Class Actual', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_mnb['Actual'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

plt.show()

**Menampilkan Jumlah Analisis Data Prediksi**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per predicted class.
sentiment_count = results_mnb['Predicted'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Hasil Analisis Data Predicted', fontsize=14, pad=20)
plt.xlabel('Class Predicted', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_mnb['Predicted'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

plt.show()

# ***NAIVE BAYES GAUSSIAN***

In [None]:
# Print a summary of the frame before training the Gaussian model.
df.info()

In [None]:
# Show the first rows of the frame before training the Gaussian model.
df.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing data: TF-IDF features and target labels.
# The feature matrix must be bound to `X` (upper case) because that is the
# name passed to train_test_split below; binding it to `x` made the split
# silently reuse a matrix left over from an earlier cell. GaussianNB needs a
# dense array, hence `.toarray()`.
# NOTE(review): the vectorizer is fitted on all rows before the split, so
# test documents influence the vocabulary and IDF weights -- consider
# fitting on the training rows only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['steming_data']).toarray()
y = df['Sentiment']

# split data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize and train model GaussianNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)

# predict with GaussianNB
y_pred_gnb = gnb.predict(X_test)

# evaluate GaussianNB
conf_matrix_gnb = confusion_matrix(y_test, y_pred_gnb)
class_report_gnb = classification_report(y_test, y_pred_gnb)
accuracy_gnb = accuracy_score(y_test, y_pred_gnb)

print("GaussianNB Results")
print("=====================================")
print("Confusion Matrix (GaussianNB):")
print(conf_matrix_gnb)
print("=====================================")
print("\nClassification Report (GaussianNB):")
print(class_report_gnb)
print("=====================================")
print(f"Accuracy (GaussianNB): {accuracy_gnb:.4f}")
print("=====================================")

# plot confusion matrix for GaussianNB
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix_gnb, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (GaussianNB)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# create DataFrame for actual vs predicted (GaussianNB); y_test keeps the
# index labels of `df`, so they are used to look up the original text
results_gnb = pd.DataFrame({'steming_data': df.loc[y_test.index, 'steming_data'], 'Actual': y_test, 'Predicted': y_pred_gnb})
results_gnb.to_csv("hasil_prediksi_gaussian.csv", encoding='utf8', index=False)
print("Actual vs Predicted (GaussianNB):")
results_gnb.head()

**Menampilkan Jumlah Analisis Data Aktual**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per actual class.
sentiment_count = results_gnb['Actual'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Hasil Analisis Data Actual', fontsize=14, pad=20)
plt.xlabel('Class Actual', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_gnb['Actual'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

# Render once, after all labels are added. Calling plt.show() inside the loop
# displayed the figure after the first label only.
plt.show()

**Menampilkan Jumlah Analisis Data Prediksi**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per predicted class.
sentiment_count = results_gnb['Predicted'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Hasil Analisis Data Predicted', fontsize=14, pad=20)
plt.xlabel('Class Predicted', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_gnb['Predicted'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

# Render once, after all labels are added. Calling plt.show() inside the loop
# displayed the figure after the first label only.
plt.show()

# **NAIVE BAYES BERNOULLI**

In [None]:
# Print a summary of the frame before training the Bernoulli model.
df.info()

In [None]:
# Show the first rows of the frame before training the Bernoulli model.
df.head()

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt

# preprocessing data: TF-IDF features and target labels.
# The feature matrix must be bound to `X` (upper case) because that is the
# name passed to train_test_split below; binding it to `x` made the split
# silently reuse a matrix left over from an earlier cell.
# NOTE(review): the vectorizer is fitted on all rows before the split, so
# test documents influence the vocabulary and IDF weights -- consider
# fitting on the training rows only.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['steming_data']).toarray()
y = df['Sentiment']

# split data (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# initialize and train model BernoulliNB
brn = BernoulliNB()
brn.fit(X_train, y_train)

# predict with BernoulliNB
y_pred_brn = brn.predict(X_test)

# evaluate BernoulliNB
conf_matrix_brn = confusion_matrix(y_test, y_pred_brn)
class_report_brn = classification_report(y_test, y_pred_brn)
accuracy_brn = accuracy_score(y_test, y_pred_brn)

print("BernoulliNB Results")
print("=====================================")
print("Confusion Matrix (BernoulliNB):")
print(conf_matrix_brn)
print("=====================================")
print("\nClassification Report (BernoulliNB):")
print(class_report_brn)
print("=====================================")
print(f"Accuracy (BernoulliNB): {accuracy_brn:.4f}")
print("=====================================")

# plot confusion matrix for BernoulliNB
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix_brn, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix (BernoulliNB)')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# create DataFrame for actual vs predicted (BernoulliNB); y_test keeps the
# index labels of `df`, so they are used to look up the original text
results_brn = pd.DataFrame({'steming_data': df.loc[y_test.index, 'steming_data'], 'Actual': y_test, 'Predicted': y_pred_brn})
# These are BernoulliNB predictions, so the file is named accordingly; it
# used to be written as "hasil_prediksi_multinomial.csv", which mislabeled
# the model that produced it.
results_brn.to_csv("hasil_prediksi_bernoulli.csv", encoding='utf8', index=False)
print("Actual vs Predicted (BernoulliNB):")
results_brn.head()

**Menampilkan Jumlah Analisis Data Aktual**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per actual class.
sentiment_count = results_brn['Actual'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Jumlah Analisis Data Actual', fontsize=14, pad=20)
plt.xlabel('Class Actual', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_brn['Actual'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

# Render once, after all labels are added. Calling plt.show() inside the loop
# displayed the figure after the first label only.
plt.show()

**Menampilkan Jumlah Analisis Data Prediksi**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Number of test documents per predicted class.
sentiment_count = results_brn['Predicted'].value_counts()
sns.set_style('whitegrid')

fig, ax = plt.subplots(figsize=(4, 2))
# `palette` needs an explicit `hue`: passing it alone is deprecated in
# seaborn. This matches the bar plot in the labeling section.
ax = sns.barplot(
    x=sentiment_count.index,
    y=sentiment_count.values,
    hue=sentiment_count.index,
    palette='pastel',
    legend=False,
    ax=ax,
)
plt.title('Jumlah Analisis Data Predicted', fontsize=14, pad=20)
plt.xlabel('Class Predicted', fontsize=12)
plt.ylabel('Jumlah Data', fontsize=12)

total = len(results_brn['Predicted'])

# Annotate each bar with its count and its share of the test set.
for i, count in enumerate(sentiment_count.values):
  percentage = f'{100 * count / total:.2f}%'
  ax.text(i, count + 0.10, f'{count}\n({percentage})', ha='center', va='bottom')

# Render once, after all labels are added. Calling plt.show() inside the loop
# displayed the figure after the first label only.
plt.show()