In [7]:
import os
import string
from pathlib import Path

import pandas as pd
import snowballstemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

In [8]:
def preprocess_text(text, verbose=True):
    """Normalize one raw Turkish document before vectorization.

    The text goes through three steps: every ASCII punctuation mark is
    replaced by a space, the text is lowercased, and each
    whitespace-separated token is stemmed with the Snowball Turkish stemmer.
    The stems are joined back with single spaces.

    Args:
        text: Raw document as a string.
        verbose: When True (the default, matching the previous behaviour) the
            original and processed text are printed. Pass False to keep the
            notebook output small when processing a whole corpus.

    Returns:
        The processed text as a single space-separated string.
    """
    if verbose:
        print("Original text:")
        print(text)

    # Removes punctuation marks. Each mark becomes a space (not an empty
    # string) so that text such as "var.deli" still splits into two tokens.
    text = text.translate(str.maketrans(string.punctuation, ' ' * len(string.punctuation)))

    # Converts all words in the text to lowercase.
    # The dotted capital I (U+0130) is mapped by hand first: str.lower() turns
    # it into "i" followed by the combining dot U+0307, which would leave a
    # stray combining mark inside the token.
    # NOTE(review): plain "I" is still lowercased to "i" (not the Turkish
    # dotless "ı") because the corpus also contains ASCII-only spellings;
    # confirm this is the desired normalization.
    text = text.replace('\u0130', 'i').lower()

    # Turkish stemming process
    stemmer = snowballstemmer.TurkishStemmer()
    stemmed_words = [stemmer.stemWord(word) for word in text.split()]

    # Combines the preprocessed text
    processed_text = " ".join(stemmed_words)

    if verbose:
        print("Processed text:")
        print(processed_text)
        print("\n")

    return processed_text

In [9]:
def load_data(path, encoding=None, preprocess=None):
    """Load and preprocess the labelled documents stored under ``path``.

    ``path`` must contain the sub-folders ``negative``, ``neutral`` and
    ``positive``; every regular file inside them is read as one document.

    Args:
        path: Directory that holds the three class folders.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default, which is what this function used
            before; pass e.g. ``"utf-8"`` to make reads platform-independent.
        preprocess: Callable applied to each file's content. ``None`` (the
            default) uses ``preprocess_text``.

    Returns:
        A ``(data, labels)`` tuple of two equally long lists: the processed
        documents and their integer labels (1 = negative, 2 = neutral,
        3 = positive).

    Raises:
        OSError: If a class folder is missing or a file cannot be read.
    """
    if preprocess is None:
        preprocess = preprocess_text

    data = []
    labels = []
    for label, folder in enumerate(['negative', 'neutral', 'positive'], start=1):
        folder_path = os.path.join(path, folder)
        # os.listdir returns entries in arbitrary order; sorting makes the
        # document order (and anything derived from it) reproducible.
        for filename in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, filename)
            # Skip sub-directories and other non-file entries instead of
            # failing inside open().
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding=encoding) as file:
                data.append(preprocess(file.read()))
            labels.append(label)
    return data, labels


In [10]:
def tfidf_knn(data, labels, k=8, cv=10):
    """Evaluate a TF-IDF + cosine KNN classifier with cross-validation.

    The vectorizer and the classifier are combined into one pipeline so that
    TF-IDF is re-fitted on the training part of every fold; fitting it once
    on the whole corpus would let held-out documents influence the IDF
    weights. The per-class TP/FN/FP counts are computed from out-of-fold
    predictions rather than from predictions on the training data itself.

    Args:
        data: List of preprocessed documents (strings).
        labels: Integer class labels aligned with ``data``
            (1 = negative, 2 = neutral, 3 = positive).
        k: Number of neighbours used by the KNN classifier.
        cv: Number of cross-validation folds (or any value accepted by the
            ``cv`` argument of scikit-learn's cross-validation helpers).

    Returns:
        Mean macro-averaged F1 score over the cross-validation folds.
    """
    print("Training TF-IDF and KNN model...")

    # TF-IDF vectorization followed by a KNN model with Cosine metric
    model = make_pipeline(
        TfidfVectorizer(),
        KNeighborsClassifier(n_neighbors=k, metric='cosine'),
    )

    # Making out-of-fold predictions: each document is predicted by a model
    # that did not see it during fitting.
    predictions = cross_val_predict(model, data, labels, cv=cv)

    class_names = ['negative', 'neutral', 'positive']
    class_labels = list(range(1, len(class_names) + 1))

    # Calculating TP, FP, FN using Confusion Matrix. The label list is passed
    # explicitly so row/column i always belongs to class_names[i], even when
    # a class is absent from the data or from the predictions.
    cm = confusion_matrix(labels, predictions, labels=class_labels)

    # Printing TP, FP, FN values for each class
    for i, class_name in enumerate(class_names):
        TP = cm[i, i]
        FN = cm[i, :].sum() - TP
        FP = cm[:, i].sum() - TP
        print(f"Class: {class_name} (Label: {i+1})")
        print(f"  TP: {TP}, FN: {FN}, FP: {FP}")

    # Calculating F1 score
    scores = cross_val_score(model, data, labels, cv=cv, scoring='f1_macro')

    return scores.mean()

In [21]:
def tfidf_to_csv(data, labels, output_file="tfidf_output.csv"):
    """Write the TF-IDF document-term table of ``data`` to a CSV file.

    Each row is one document and each column one vocabulary term; a final
    ``Class`` column carries the label of the document.

    Args:
        data: List of documents (strings) to vectorize.
        labels: Class labels aligned with ``data``.
        output_file: Destination path of the CSV file.
    """
    tfidf = TfidfVectorizer()
    weights = tfidf.fit_transform(data).toarray()

    # One column per term, then the labels appended as the last column.
    table = pd.DataFrame(weights, columns=tfidf.get_feature_names_out()).assign(Class=labels)

    table.to_csv(output_file)
    print(f"TF-IDF table saved to {output_file}")

In [22]:
if __name__ == "__main__":
    # Location of the raw corpus. It can be overridden with the RAW_TEXTS_DIR
    # environment variable so the notebook also runs on other machines; the
    # fallback is the original local path.
    folder_path = os.environ.get(
        "RAW_TEXTS_DIR",
        r"C:\Users\İBRAHİM HALİL\Assignment\Raw_Texts",
    )
    data, labels = load_data(folder_path)

    tfidf_to_csv(data, labels, output_file="tfidf_output.csv")

    # Run the TF-IDF + KNN model and print its success score
    tfidf_knn_score = tfidf_knn(data, labels, k=8)
    print(f"TF-IDF ve KNN modeli başarı skoru (F1-macro): {tfidf_knn_score:.4f}")

Original text:
hayir!
Processed text:
hayir


Original text:
#turkcell adinda bir telefon operatoru var.deli dumrul dan besbin beter.Para carpmanin adi #turkcell olmus.eline ver kolunu kurtaramiyorsun
Processed text:
turkcell ad bir telefo operator var del dumrul dan besp beter par carpman adi turkcell olmus el ver kol kurtaramiyor


Original text:
Dileğim; SuperOnline, Turkcell, TTNET vb. firmaların 2012'de "sizi arayacağız" demek yerine direk burdan cevap vermesi şeklinde. #hope #2012
Processed text:
dilek superonl turkcell ttnet vb firma 2012 de siz arayacak demek yer direk bur cevap vermes şekl hope 2012


Original text:
Mesaj hakkı yapamıyorum,turkcell yok olsun istiyorum şuan
Processed text:
mesaj hakkı yapamıyor turkcell yok ol istiyor şuan


Original text:
Turksel amca yapmiyo artik sana oyle kiyaaklar kankı
Processed text:
turksel am yapmiyo artik sa oyle kiyaak kankı


Original text:
TÜRKCELL SÜPERONLİNE ANADOLU YAKASINDAKİ SORUNUNU 2012 ÇÖZER Mİ ACABA.
Processed text:
türkce