In [1]:
# Packages importieren
import pandas as pd
import numpy as np
import nltk
import string
import os
from sklearn.feature_extraction.text import CountVectorizer

## Naive Bayes

In [2]:
posts = pd.read_csv('posts_italienisch_deutsch.csv', sep=';')
posts.head(n=5)

Unnamed: 0,post,type
0,"Wenn ich meine Freundin zum Essen einlade, geh...",I
1,Mein Lieblingsrestaurant hat eine gut bürgerli...,D
2,Pizza und Wein und alles ist fein!,I
3,Ich mag italienische Pizza und vor allem Wein,I
4,Für mich besteht ein gutes Essen aus einem gro...,D


In [3]:
# Preprocessing

# Zeichensetzung
def remove_punctuation(text):
    return "".join([char for char in text if char not in string.punctuation])
posts.post = posts.post.apply(remove_punctuation)
# Kleinschreibung
posts.post = posts.post.apply(str.lower)
# Whitespace
posts.post = posts.post.apply(str.strip)

posts.head(n=5)

Unnamed: 0,post,type
0,wenn ich meine freundin zum essen einlade gehe...,I
1,mein lieblingsrestaurant hat eine gut bürgerli...,D
2,pizza und wein und alles ist fein,I
3,ich mag italienische pizza und vor allem wein,I
4,für mich besteht ein gutes essen aus einem gro...,D


In [4]:
# Document Term Matrix
def create_dtm(docs, binary=False):

    vec = CountVectorizer(binary=binary)
    X = vec.fit_transform(docs)
    dtm = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
    # transponiere matrix
    #dtm = tdm.transpose()
    return dtm

In [5]:
dtm = create_dtm(posts.post, binary=False)
dtm.head(n=5)

Unnamed: 0,ach,allem,allen,alles,als,am,anfangen,auch,auf,aus,...,vor,vorspeise,was,wein,wenn,wichtig,wir,zu,zum,über
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,1,0,1,0,2,0
1,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [6]:
# Aufteilung in Trainings- und Testdaten
posts_training = dtm.iloc[0:6]
posts_test = dtm.iloc[6:14]
# Trainingslabels
posts_traininglabel = pd.factorize(posts.type.iloc[0:6])[0]
# Testlabels
posts_testlabel = pd.factorize(posts.type.iloc[6:14])[0]

In [7]:
from sklearn.naive_bayes import GaussianNB

In [8]:
# Training
classifier = GaussianNB()
classifier.fit(posts_training, posts_traininglabel)

GaussianNB()

In [9]:
# Prediction
y_pred = classifier.predict(posts_test)
y_pred

array([0, 0, 1, 1, 0, 1, 1, 1], dtype=int64)

In [10]:
posts_testlabel

array([0, 0, 1, 1, 0, 1, 0, 1], dtype=int64)

In [11]:
from sklearn.metrics import confusion_matrix,accuracy_score

cm = confusion_matrix(posts_testlabel, y_pred)
ac = accuracy_score(posts_testlabel,y_pred)

In [12]:
print(cm)
print(ac)

[[3 1]
 [0 4]]
0.875


## K-Nearest-Neighbour

In [13]:
#Separates Einlesen der Datensätze für positiv und negativ bewertete Filme
pos_movie_dir = 'pos'
neg_movie_dir = 'neg'
#Funktion zum Einlesen der Textdateien
def read_txt_in_dir(directory):
    dir_txt = []
    for filename in os.listdir(directory):
        with open(os.path.join(directory, filename)) as file:
            txt = file.read()
            dir_txt.append(txt)
    return dir_txt
#Speichern der Daten in einer Liste
pos_movies = read_txt_in_dir(pos_movie_dir)
neg_movies = read_txt_in_dir(neg_movie_dir)
#Alle Daten in einer Tabelle zusammenfassen
all_movies = pd.DataFrame({'pos_movies': pos_movies, 'neg_movies': neg_movies})


In [14]:
# Preprocessing

# Zeichensetzung und Zahlen entfernen
def remove_punctuation(text):
    return "".join([char for char in text if char not in string.punctuation and not char.isdigit()])
all_movies = all_movies.applymap(remove_punctuation)
# Kleinschreibung
all_movies = all_movies.applymap(str.lower)
# Whitespace
all_movies = all_movies.applymap(str.strip)
# Stopwords

In [15]:
pos_movies_processed = all_movies.pos_movies
neg_movies_processed = all_movies.neg_movies

In [129]:
# Jeweils 100 positive und negative auswählen
selected_movies = np.concatenate([pos_movies_processed[0:200].values, neg_movies_processed[0:200].values])
movie_labels = ['positive']*200 + ['negative']*200
movie_labels_factorized = pd.factorize(movie_labels)[0]

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words={'english'}, min_df = 0, analyzer='word')
X = vectorizer.fit_transform(selected_movies)

In [133]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, movie_labels_factorized, random_state=10, stratify=movie_labels_factorized)

In [136]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=12)

In [137]:
# Training
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=12)

In [143]:
from sklearn.metrics import mean_squared_error, precision_score, accuracy_score
from math import sqrt

y_pred = knn.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = sqrt(mse)
print('RMSE: ', rmse)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: ', accuracy)

precision = precision_score(y_test, y_pred, average=None)
# Spalte 1: positive Reviews
print('Precision: ', precision)

RMSE:  0.6164414002968976
Accuracy:  0.62
Precision:  [0.58571429 0.7       ]


In [153]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.82      0.68        50
           1       0.70      0.42      0.53        50

    accuracy                           0.62       100
   macro avg       0.64      0.62      0.60       100
weighted avg       0.64      0.62      0.60       100



## Neuronales Netz

In [150]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(activation='relu')
mlp.fit(X_train, y_train)

MLPClassifier()

In [151]:
y_pred_nn = mlp.predict(X_test)

In [152]:
from sklearn.metrics import classification_report,confusion_matrix

print(classification_report(y_test, y_pred_nn))

              precision    recall  f1-score   support

           0       0.85      0.80      0.82        50
           1       0.81      0.86      0.83        50

    accuracy                           0.83       100
   macro avg       0.83      0.83      0.83       100
weighted avg       0.83      0.83      0.83       100

