<a href="https://colab.research.google.com/github/karimahagstn/latihan_1/blob/main/MachineLearning_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
def to_lowercase(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

# Example usage (this sample is already lowercase, so it prints unchanged)
sample_text = "berbayar semua."
lowered_sample = to_lowercase(sample_text)
print(lowered_sample)


berbayar semua.


In [2]:
import string

def remove_punctuation(text):
    """Return ``text`` with every character in ``string.punctuation`` deleted."""
    # Three-argument maketrans: the third string lists the characters to drop.
    deletion_table = str.maketrans('', '', string.punctuation)
    return text.translate(deletion_table)

# Example usage
sample_text = "Hay Illy!"
without_punctuation = remove_punctuation(sample_text)
print(without_punctuation)


Hay Illy


In [3]:
import re

def remove_numbers(text):
    """Return ``text`` with every run of digit characters removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Example usage (this sample contains no digits, so it prints unchanged)
sample_text = "aku suka bakso dan mie ayam."
without_numbers = remove_numbers(sample_text)
print(without_numbers)


aku suka bakso dan mie ayam.


In [4]:
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt_tab')

def tokenize(text):
    """Split ``text`` into tokens by delegating to NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

# Example usage
sample_text = "aku bahagia"
sample_tokens = tokenize(sample_text)
print(sample_tokens)


['aku', 'bahagia']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [5]:
from nltk.corpus import stopwords

nltk.download('stopwords')

def remove_stopwords(words, language='english'):
    """Drop stop words from a sequence of tokens.

    Args:
        words: Iterable of token strings (e.g. the output of ``tokenize``).
        language: Name of the NLTK stop-word list to filter against. Defaults
            to ``'english'``, the value that was previously hard-coded, so
            existing calls behave exactly as before. Pass another list name
            (for example ``'indonesian'``, provided the downloaded corpus
            ships it) when the text is not English.

    Returns:
        A list with the tokens from ``words`` that are not in the selected
        stop-word list, in their original order. Matching is an exact,
        case-sensitive comparison.
    """
    stop_words = frozenset(stopwords.words(language))
    return [word for word in words if word not in stop_words]

# Example usage. remove_stopwords filters against the English stop-word list,
# which contains none of these Indonesian words, so all tokens are kept.
sample_text = "kita pasti bisa"
tokenized_text = tokenize(sample_text)
remaining_words = remove_stopwords(tokenized_text)
print(remaining_words)


['kita', 'pasti', 'bisa']


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from nltk.stem import PorterStemmer

def stem_words(words):
    """Return the Porter stem of each token in ``words``, preserving order."""
    stemmer = PorterStemmer()
    stems = []
    for word in words:
        stems.append(stemmer.stem(word))
    return stems

# Example usage: tokenize, filter stop words, then stem what is left.
# Punctuation is not stripped first, so the trailing "." stays as a token.
sample_text = "ayo tetap semangat."
tokenized_text = tokenize(sample_text)
filtered_words = remove_stopwords(tokenized_text)
stemmed_words = stem_words(filtered_words)
print(stemmed_words)


['ayo', 'tetap', 'semangat', '.']


In [7]:
# Three short sentences that form the bag-of-words corpus.
sentence1, sentence2, sentence3 = (
    "aku suka sepak bola",
    "jude bellingham idola ku",
    "dia sangat berbakat dan tampan",
)

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# Gather the sentences into one corpus list and show it.
docs = [
    sentence1,
    sentence2,
    sentence3,
]
print(docs)

['aku suka sepak bola', 'jude bellingham idola ku', 'dia sangat berbakat dan tampan']


In [9]:
# Define the count vectorizer and fit it on the documents.
vec = CountVectorizer()
X = vec.fit_transform(docs)

# Convert the document-term matrix into a pandas DataFrame with one column
# per vocabulary term.
feature_names = vec.get_feature_names_out()
df = pd.DataFrame(X.toarray(), columns=feature_names)
df.head()

Unnamed: 0,aku,bellingham,berbakat,bola,dan,dia,idola,jude,ku,sangat,sepak,suka,tampan
0,1,0,0,1,0,0,0,0,0,0,1,1,0
1,0,1,0,0,0,0,1,1,1,0,0,0,0
2,0,0,1,0,1,1,0,0,0,1,0,0,1


In [10]:
import numpy as np
from collections import Counter
from math import log

# The two documents that make up the corpus
documents = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog sleeps in the sun",
]

# Preprocessing: lowercase each document, then split it on whitespace
tokenized_documents = list(map(str.split, map(str.lower, documents)))

# Compute TF
def compute_tf(tokenized_doc):
    """Return each term's relative frequency within one tokenized document.

    Args:
        tokenized_doc: Sequence of tokens belonging to a single document.

    Returns:
        dict mapping every distinct term to ``count / len(tokenized_doc)``.
        An empty document yields an empty dict.
    """
    doc_length = len(tokenized_doc)
    occurrences = Counter(tokenized_doc)
    return {term: n / doc_length for term, n in occurrences.items()}

# One TF dictionary per document, in corpus order.
tf_list = list(map(compute_tf, tokenized_documents))

print("Term Frequency (TF):")
for idx, tf in enumerate(tf_list):
    print(f"Document {idx + 1} TF:")
    for term in tf:
        score = tf[term]
        print(f"    {term}: {score:.4f}")


Term Frequency (TF):
Document 1 TF:
    the: 0.2222
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.1111
    dog: 0.1111
Document 2 TF:
    the: 0.2857
    lazy: 0.1429
    dog: 0.1429
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [11]:
# Compute IDF
def compute_idf(tokenized_docs):
    """Return the inverse document frequency of every term in the corpus.

    The score for a term is ``log(N / (1 + df)) + 1`` where ``N`` is the
    number of documents and ``df`` the number of documents containing the
    term. Only the denominator is smoothed; this differs from scikit-learn's
    ``smooth_idf`` variant, which also adds 1 to the document count.

    Args:
        tokenized_docs: Sequence of documents, each a sequence of hashable
            tokens.

    Returns:
        dict mapping each term to its IDF score. Keys appear in the order the
        terms are first seen while reading the corpus, so the result (and
        anything printed from it) is the same on every run. An empty corpus
        yields an empty dict.
    """
    total_docs = len(tokenized_docs)

    # Single pass over the corpus: count, for each term, how many documents
    # contain it. This avoids rescanning every document list once per term.
    doc_frequency = {}
    for doc in tokenized_docs:
        # dict.fromkeys de-duplicates the document's tokens while keeping
        # first-seen order, so each document counts a term at most once.
        for term in dict.fromkeys(doc):
            doc_frequency[term] = doc_frequency.get(term, 0) + 1

    return {
        term: log(total_docs / (1 + containing)) + 1
        for term, containing in doc_frequency.items()
    }

# IDF scores for the whole corpus.
idf_dict = compute_idf(tokenized_documents)

print("\nInverse Document Frequency (IDF):")
for term in idf_dict:
    score = idf_dict[term]
    print(f"    {term}: {score:.4f}")



Inverse Document Frequency (IDF):
    the: 0.5945
    quick: 1.0000
    jumps: 1.0000
    fox: 1.0000
    in: 1.0000
    lazy: 0.5945
    over: 1.0000
    dog: 0.5945
    brown: 1.0000
    sleeps: 1.0000
    sun: 1.0000


In [12]:
# Compute TF-IDF
def compute_tfidf(tf_list, idf_dict):
    """Combine per-document TF scores with corpus-wide IDF scores.

    Args:
        tf_list: List of dicts, one per document, mapping term -> TF value.
        idf_dict: dict mapping term -> IDF value.

    Returns:
        List of dicts parallel to ``tf_list`` mapping each term to
        ``tf * idf``. A term missing from ``idf_dict`` is given an IDF of 0.
    """
    return [
        {term: tf_value * idf_dict.get(term, 0) for term, tf_value in tf.items()}
        for tf in tf_list
    ]

# TF-IDF scores, one dictionary per document.
tfidf_list = compute_tfidf(tf_list, idf_dict)

print("\nTF-IDF:")
for idx, tfidf in enumerate(tfidf_list):
    print(f"Document {idx + 1} TF-IDF:")
    for term in tfidf:
        score = tfidf[term]
        print(f"    {term}: {score:.4f}")


TF-IDF:
Document 1 TF-IDF:
    the: 0.1321
    quick: 0.1111
    brown: 0.1111
    fox: 0.1111
    jumps: 0.1111
    over: 0.1111
    lazy: 0.0661
    dog: 0.0661
Document 2 TF-IDF:
    the: 0.1699
    lazy: 0.0849
    dog: 0.0849
    sleeps: 0.1429
    in: 0.1429
    sun: 0.1429


In [13]:
# Explicit %pip magic: a bare `pip install ...` line only works while IPython's
# automagic is enabled and no variable named `pip` exists.
%pip install gensim



In [14]:
# Use %pip rather than !pip so the packages are installed into the environment
# of the running kernel instead of whatever `pip` the subshell resolves to.
# NOTE: numpy is already imported earlier in this notebook; restart the runtime
# after this cell so the upgraded version is actually the one that gets loaded.
%pip install -U numpy gensim


Collecting numpy
  Using cached numpy-2.2.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)


In [16]:
from gensim.models import Word2Vec
import numpy as np

corpus = [
    'aku hebat',
    'aku bisa sampai akhir',
    'tetap semnagat'
]

# Whitespace-tokenize each document; Word2Vec is trained on token lists.
sentences = [doc.split() for doc in corpus]

# Train with an explicit seed on a single worker thread so repeated runs give
# the same vectors; with several workers, thread scheduling makes the result
# vary between runs. (Per the gensim documentation, reproducibility across
# interpreter launches additionally requires a fixed PYTHONHASHSEED.)
model = Word2Vec(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=1,
    seed=42,
)

def document_vector(doc, w2v=None):
    """Average the word vectors of the in-vocabulary tokens of ``doc``.

    Args:
        doc: Document string; it is split on whitespace into tokens.
        w2v: Optional trained model exposing a ``wv`` lookup that supports
            ``in``, indexing by token and a ``vector_size`` attribute. When
            omitted, the notebook-level ``model`` is looked up at call time,
            which is the original behaviour. Passing the model explicitly
            keeps this function usable after ``model`` has been rebound to
            something else in a later cell.

    Returns:
        A 1-D array holding the element-wise mean of the vectors of all
        tokens found in the vocabulary. If no token is in the vocabulary, a
        zero vector of length ``wv.vector_size`` is returned instead of the
        scalar ``nan`` (plus a RuntimeWarning) that averaging an empty list
        would produce.
    """
    if w2v is None:
        w2v = model
    keyed_vectors = w2v.wv
    known_vectors = [
        keyed_vectors[word] for word in doc.split() if word in keyed_vectors
    ]
    if not known_vectors:
        return np.zeros(keyed_vectors.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged embedding per document in the corpus.
doc_vectors = list(map(document_vector, corpus))
print(doc_vectors)


[array([ 3.79802217e-03, -2.11045146e-03,  2.01749615e-03,  5.00781881e-03,
       -4.74703172e-03, -2.98431586e-03,  6.28636684e-03,  4.47635865e-03,
       -4.13069688e-03, -2.63705011e-03,  6.63890177e-03, -9.68459062e-06,
       -2.63043772e-03,  7.94364978e-03, -4.89072222e-03, -1.32721360e-03,
        6.02599559e-03,  3.87065066e-03, -3.39117949e-03, -9.16568935e-03,
        4.23025619e-03,  1.39100326e-03,  8.06296524e-03,  9.86396684e-04,
        3.92047688e-03, -4.99477959e-04, -1.39120407e-03,  3.84469749e-04,
       -3.64460400e-03, -2.97514210e-03, -4.55324538e-04,  4.00504051e-03,
        4.43168217e-03, -2.17107590e-03, -4.22071153e-03, -1.19208125e-04,
        5.75602287e-04, -7.31246127e-03, -2.92752008e-03, -6.85510458e-03,
       -1.16297789e-03, -3.82369151e-04, -2.41616741e-04, -5.81768528e-03,
        1.69328752e-03,  4.68940893e-03, -7.72334402e-03, -1.65157020e-04,
        3.26455804e-04,  3.27500631e-03, -4.21460345e-03, -2.16787658e-03,
       -6.37731701e-03, 

In [17]:
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn import metrics

# 1. Collect the data
newsgroups = fetch_20newsgroups(subset='all')

# 2. Preprocess the data
# No dedicated preprocessing step: the raw text is handed to TfidfVectorizer.

# 3. Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    newsgroups.data,
    newsgroups.target,
    test_size=0.25,
    random_state=42,
)

# 4. Train the model
# Build a pipeline that chains TfidfVectorizer and MultinomialNB.
# NOTE: this rebinds the name `model`, which the Word2Vec cell also assigns and
# which document_vector reads as a global.
model = make_pipeline(TfidfVectorizer(), MultinomialNB())

# Fit the pipeline on the training set
model.fit(X_train, y_train)

# 5. Evaluate the model
# Predict on the testing set
y_pred = model.predict(X_test)

# Report model performance
print(f"Accuracy: {metrics.accuracy_score(y_test, y_pred)}")
print("Classification Report:")
report = metrics.classification_report(y_test, y_pred, target_names=newsgroups.target_names)
print(report)

# Confusion matrix
print("Confusion Matrix:")
confusion = metrics.confusion_matrix(y_test, y_pred)
print(confusion)


Accuracy: 0.8425297113752123
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.88      0.72      0.79       198
           comp.graphics       0.86      0.79      0.82       245
 comp.os.ms-windows.misc       0.88      0.83      0.85       242
comp.sys.ibm.pc.hardware       0.66      0.86      0.75       238
   comp.sys.mac.hardware       0.95      0.84      0.89       250
          comp.windows.x       0.96      0.80      0.87       260
            misc.forsale       0.96      0.66      0.78       241
               rec.autos       0.89      0.93      0.91       244
         rec.motorcycles       0.91      0.95      0.93       219
      rec.sport.baseball       0.96      0.94      0.95       261
        rec.sport.hockey       0.90      0.98      0.94       245
               sci.crypt       0.78      0.98      0.87       251
         sci.electronics       0.92      0.80      0.86       249
                 sci.me