# KI Challenge 

# Dokumentenklassifizierung

## Lena Hinkel, lena.hinkel@gmx.de, 1981095
## Lars Böcking, boecking.lars@googlemail.com, 2000264
## Mariana Zehender, marianazehender@aol.com, 2056046
## Hongchen Ji, hongchenji@gmail.com, 1962863
## Xiaoyu Yang, yxy6677@gmail.com, 2132536

In [3]:
import random 
import pandas as pd
import numpy as np
from pandas import DataFrame
import csv
from spacy.tokenizer import Tokenizer

ModuleNotFoundError: No module named 'spacy'

## Import Data - Negativ und Positiv gekennzeichnete Rezensionen für das Trainieren des Algorithmus 


In [2]:
def getTrainingSets():
    """Load the two training files from the current working directory.

    Returns:
        A tuple ``(dataPos, dataNeg)``. The first element holds the lines of
        ``pos.txt``, the second the lines of ``neg.txt``; line endings are
        kept as read.
    """
    with open("pos.txt") as pos_handle:
        positive_lines = list(pos_handle)
    with open("neg.txt") as neg_handle:
        negative_lines = list(neg_handle)

    return positive_lines, negative_lines

In [3]:
def getEvaluationsSet():
    """Load the evaluation file from the current working directory.

    Returns:
        The lines of ``evaluation.txt`` as a list of strings, line endings
        kept as read.
    """
    with open("evaluation.txt") as source:
        return list(source)

## Vorbereitungen für NLP (Natural Language Processing) = Stemming and Lemmatization

### Erstmal Preprocessing
#### (Aufbereiten und Analysieren von unstrukturierten Texten - Siehe Datensätze dataPos und dataNeg)

Lemmatisierung: Reduziere ein Wort – mittels eines festgelegten Korpus – auf den Wortstamm.
traf, trifft, treffen, treffe → treffen (aus Übung 1)

Wortfilterung (Stopwords removal): Lösche Stoppwörter, z.B. der, die, das, ein, über Stopplisten im Internet (aus Übung 1). Lösche außerdem Zahlen und Satzzeichen, indem diese Zeichen durch Leerzeichen ersetzt werden.

Synonyme: Ersetze Wörter mit derselben Bedeutung mit einem identischen Synonym. Meist mittels lexikalischer Datenbank, wie bspw. Wordnet (aus Übung 1) 



In [4]:
import spacy 

def datenAufbereitung(datensatz):
    """Clean and lemmatize a list of raw text documents in place.

    Each document goes through two steps:

    1. Every character listed in ``strip_chars`` (punctuation, digits and the
       space itself) is replaced by a space and the text is lower-cased.
    2. The text is tokenized with a spaCy ``Tokenizer`` built on the
       ``en_core_web_sm`` vocabulary and rebuilt from each token's
       ``lemma_``, every lemma being prefixed with one space.

    Args:
        datensatz: Mutable sequence (e.g. list) of strings. Its entries are
            overwritten with the processed text.

    Returns:
        The same ``datensatz`` object, now holding the processed strings.
    """
    # Characters to blank out. The backslash is written as "\\": the former
    # spelling "\]" is an invalid escape sequence, which newer Python versions
    # warn about. The resulting character set is unchanged.
    strip_chars = "!\"#$%&'()*+, -./:;<=>?@[\\]^_`{|}~1234567890"
    # Build the translation table once instead of once per document.
    to_space = str.maketrans(strip_chars, " " * len(strip_chars))

    sp = spacy.load('en_core_web_sm', disable=["parser", "ner", "tagger"])
    token = Tokenizer(sp.vocab)

    for i, text in enumerate(datensatz):
        cleaned = text.translate(to_space).lower()
        # join avoids repeated string concatenation; the output format
        # (leading space before every lemma) is the same as before.
        datensatz[i] = "".join(" " + word.lemma_ for word in token(cleaned))

    return datensatz

## aussortierter Code

# Discarded code: first lemmatization attempt, based on NLTK's
# WordNetLemmatizer. Only the print below executes; the triple-quoted string
# that follows is a bare expression statement, so the code inside it never runs.
print("erster Versuch lemmatisierung")

"""# 2 Lemmatisierung 
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print(dataPos[0])

for i in range(documentLen):
    cache = ""
    for word in dataPos[i].split():
        cache += " " +lemmatizer.lemmatize(word)
    dataPos[i] = cache
    
print(dataPos[0])
"""

In [5]:
def combineDataSets(set1, set2):
    """Concatenate two iterables into a single list of strings.

    Args:
        set1: Iterable whose items come first in the result.
        set2: Iterable whose items follow those of ``set1``.

    Returns:
        A new list containing ``str(item)`` for every item of ``set1`` and
        then of ``set2``, in their original order.
    """
    return [str(entry) for part in (set1, set2) for entry in part]

### Vektorisierung


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer #tfidf steht für Term Frequency-Inverse Document Frequency

def vectorizer(dataset):
    """Fit a bag-of-words + TF-IDF pipeline on ``dataset`` and transform it.

    Compared with the previous version, this drops work whose results were
    never used: an extra ``TfidfVectorizer`` fit, an IDF-weights DataFrame
    whose sorted copy was discarded, and a second count pass over the same
    documents. Building that DataFrame called ``cv.get_feature_names()``,
    which no longer exists in recent scikit-learn releases.

    Args:
        dataset: Collection of text documents (strings) to learn the
            vocabulary and IDF weights from.

    Returns:
        A tuple ``(cv, tfidf_transformer, tf_idf_vector)``:
        the fitted ``CountVectorizer``, the fitted ``TfidfTransformer`` and
        the TF-IDF matrix of ``dataset`` (one row per document).
    """
    cv = CountVectorizer()
    # Learn the vocabulary and count the terms in a single pass.
    count_vector = cv.fit_transform(dataset)

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    # Fit the IDF weights on the counts and weight those same counts.
    tf_idf_vector = tfidf_transformer.fit_transform(count_vector)

    return cv, tfidf_transformer, tf_idf_vector

In `tf_idf_vector` sind alle 5007 Reviews in Form ihrer TF-IDF-Werte abgespeichert. Das bedeutet, dass jedes Review auf einen Vektor der Größe 10158 abgebildet wird.

In [7]:
from sklearn.model_selection import train_test_split

def splitDataSet(tf_idf_vector, num_pos=None, num_neg=None):
    """Attach labels to the TF-IDF matrix and split it into train/test parts.

    The rows of ``tf_idf_vector`` are labelled in order: the first
    ``num_pos`` rows get label 1, the following ``num_neg`` rows get label 0.

    Args:
        tf_idf_vector: Matrix exposing ``toarray()`` (one row per document).
        num_pos: Number of leading rows labelled 1. When omitted, falls back
            to ``len(dataPos)`` of the module-level ``dataPos`` list, which
            is what the function always used before.
        num_neg: Number of following rows labelled 0. When omitted, falls
            back to ``len(dataNeg)`` of the module-level ``dataNeg`` list.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)`` produced by
        ``train_test_split`` with ``test_size=0.33`` and ``random_state=1``.
    """
    # Explicit counts make the function usable without the notebook globals;
    # the globals remain the default so existing calls behave as before.
    if num_pos is None:
        num_pos = len(dataPos)
    if num_neg is None:
        num_neg = len(dataNeg)

    x_train = tf_idf_vector.toarray()
    y_train = [1] * num_pos + [0] * num_neg

    X_train, X_test, Y_train, Y_test = train_test_split(
        x_train, y_train, test_size=0.33, random_state=1
    )

    return X_train, X_test, Y_train, Y_test

In [8]:
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.linear_model import PassiveAggressiveClassifier, RidgeClassifier
from sklearn.metrics import accuracy_score

def trainClassifier(X_train, X_test, Y_train, Y_test):
    """Fit a logistic-regression classifier and print its test accuracy.

    Args:
        X_train: Feature rows used for fitting.
        X_test: Feature rows used for the accuracy check.
        Y_train: Labels belonging to ``X_train``.
        Y_test: Labels belonging to ``X_test``.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    # Alternative classifiers, kept disabled for reference:
    #   PassiveAggressiveClassifier(max_iter=1000, random_state=0, tol=1e-3)
    #   Perceptron(tol=1e-3, random_state=0)
    #   RidgeClassifier().fit(X_train, Y_train)
    clf = LogisticRegression().fit(X_train, Y_train)

    # Accuracy on the held-out rows, rounded to four decimals for display.
    predictions = clf.predict(X_test)
    print(f'Accuracy: {round(accuracy_score(Y_test, predictions),4)}')

    return clf

In [12]:
def predicEvaluation(csf, cv, evaluation, tf_idf_vector_eval):
    """Predict labels for the evaluation documents.

    Args:
        csf: Fitted classifier exposing ``predict``.
        cv: Fitted count vectorizer exposing ``transform``.
        evaluation: Collection of preprocessed documents to classify.
        tf_idf_vector_eval: The fitted TF-IDF transformer (an object with a
            ``transform`` method). Despite the parameter name, this is what
            the caller in this file passes. If the argument has no
            ``transform`` method, the module-level ``tfidf_transformer`` is
            used instead, as the function did unconditionally before.

    Returns:
        Whatever ``csf.predict`` returns for the TF-IDF weighted documents.
    """
    # Previously the argument was overwritten and the global transformer was
    # always used, so the function failed wherever that global was missing.
    if hasattr(tf_idf_vector_eval, "transform"):
        transformer = tf_idf_vector_eval
    else:
        transformer = tfidf_transformer

    # Raw term counts first, then TF-IDF weighting.
    count_vector = cv.transform(evaluation)
    return csf.predict(transformer.transform(count_vector))

          
# Aussortierter Code
# Discarded experiment: a small Keras feed-forward network trained on the
# TF-IDF features. It is kept as an inert triple-quoted string, like the other
# discarded cell in this file. The closing quotes had no matching opening
# quotes, which started an unterminated string literal, and the code would
# have run before X_train / Y_train exist.
# The output activation is 'sigmoid': a softmax over a single unit always
# yields 1.0 and cannot be trained with binary cross-entropy.
"""
import keras

from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.optimizers import Adam

print(len(X_train))
input_shape = X_train[0].shape

model = Sequential()
model.add(Dense(100, input_shape=input_shape))
model.add(Activation('relu'))

model.add(Dense(10, input_shape=input_shape))
model.add(Activation('relu'))

model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer="Adam",
              metrics=['accuracy'])

model.fit(X_train, Y_train,
              batch_size=1,
              epochs=10,
              validation_data=(X_test, Y_test),
              shuffle=True)

scores = model.evaluate(X_test, Y_test, verbose=1)
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])
"""

In [23]:
def savePredictions(results):
    """Write the predictions to ``predictionsEvaluation.csv``.

    The file gets a header row followed by one row per prediction, made of a
    running number starting at 1 and the prediction itself.

    Args:
        results: Iterable of predictions, written in iteration order.

    Returns:
        The ``csv.writer`` used for writing; its underlying file is already
        closed when this function returns.
    """
    with open('predictionsEvaluation.csv', mode='w+', newline='') as pred_file:
        writer = csv.writer(pred_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow(["Evaluationsdatensatz", "Prediction"])
        for row_number, pred in enumerate(results, start=1):
            writer.writerow([row_number, pred])
    return writer

In [24]:
# End-to-end run: load data, preprocess, vectorize, train, predict, save.
# NOTE: dataPos, dataNeg and tfidf_transformer are referenced by name as
# module-level globals inside splitDataSet / predicEvaluation, so these names
# must not be changed here.
dataPos, dataNeg= getTrainingSets()

# datenAufbereitung overwrites the entries of the list it receives and returns
# that same list object.
dataPosBereinigt = datenAufbereitung(dataPos)
dataNegBereinigt = datenAufbereitung(dataNeg)

# dataPos entries first, then dataNeg entries; splitDataSet creates its labels
# in the same order (1-labels before 0-labels).
trainSet = combineDataSets(dataPosBereinigt, dataNegBereinigt)

cv, tfidf_transformer, tf_idf_vector = vectorizer(trainSet)
X_train, X_test, Y_train, Y_test = splitDataSet(tf_idf_vector)
csf = trainClassifier(X_train, X_test, Y_train, Y_test)

# The evaluation documents get the same preprocessing as the training data.
evaluation = getEvaluationsSet()
evaluation = datenAufbereitung(evaluation)

# The fitted TF-IDF transformer is handed over as the fourth argument.
predictions = predicEvaluation(csf, cv, evaluation, tfidf_transformer)

savePredictions(predictions)




Accuracy: 0.8461


<_csv.writer at 0x1a26f68ef0>