In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle, resample

## Load data 

In [None]:
# Colab-only step: mount Google Drive at /content/drive so the dataset path used
# by the following read_csv cell resolves.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# CSV on the mounted Drive that feeds this notebook; its first column becomes the index.
dataset_path = "/content/drive/MyDrive/Masterarbeit/Crawler/data/FinalDataset/Ablation02/Stepwords_CleanData_FinalDataset_ForModel_Ablation2.csv"
news = pd.read_csv(dataset_path, index_col=0)

In [None]:
# Get the same train and test data
def split_df_in_train_test(df, train_fraction=0.8):
    """Split a frame by position into a leading train part and a trailing test part.

    No shuffling happens here: the row order of ``df`` decides which rows end
    up where, so repeated calls on the same frame return the same split.

    Args:
        df: pandas DataFrame to split.
        train_fraction: Share of the rows (counted from the top) assigned to
            the train part. Defaults to 0.8.

    Returns:
        Tuple ``(df_train, df_test)``. Both are slices of ``df.reset_index()``,
        i.e. the previous index is kept as a regular column and the row labels
        come from a fresh 0..n-1 range over the whole frame.
    """
    df = df.reset_index()
    # Round the product. The earlier version wrapped only the (integer) row
    # count in np.round, so the split point was truncated instead of rounded.
    split_point = int(np.round(df.shape[0] * train_fraction))
    # Positional slicing; equivalent to label slicing on the fresh range index.
    df_train = df.iloc[:split_point, :]
    df_test = df.iloc[split_point:, :]
    return df_train, df_test

In [None]:
# Number of rows drawn per class when the three label groups are sampled below.
n_samples = 6000

In [None]:
def _draw_class_sample(frame, class_name, size, max_length=512, seed=42):
    """Draw ``size`` rows of one class, restricted to rows with "Länge" < ``max_length``.

    Args:
        frame: DataFrame with at least the columns "label" and "Länge".
        class_name: Value of the "label" column to select.
        size: Number of rows to return.
        max_length: Exclusive upper bound on "Länge"
            (presumably the text length -- TODO confirm the unit).
        seed: Seed used for both the shuffle and the draw.

    Returns:
        DataFrame with ``size`` rows of the requested class.
    """
    import warnings

    pool = frame[(frame["label"] == class_name) & (frame["Länge"] < max_length)]

    # sklearn's resample draws WITH replacement by default. Because the sample
    # is split into train/test afterwards, duplicated rows could then sit on
    # both sides of the split and inflate the test scores. Draw without
    # replacement whenever the class has enough rows.
    needs_replacement = len(pool) < size
    if needs_replacement:
        warnings.warn(
            "Class '{}' has only {} rows (< {}); sampling with replacement, "
            "so duplicated rows may appear in both train and test.".format(
                class_name, len(pool), size
            )
        )

    return resample(
        shuffle(pool, random_state=seed),
        replace=needs_replacement,
        random_state=seed,
        n_samples=size,
    )


left_FE = _draw_class_sample(news, "links", n_samples)
center_FE = _draw_class_sample(news, "central", n_samples)
right_FE = _draw_class_sample(news, "rechts", n_samples)

In [None]:
# Stack the leading (train) slice of every per-class sample into one frame,
# in the order left, center, right.
train_parts = [
    split_df_in_train_test(class_df)[0]
    for class_df in (left_FE, center_FE, right_FE)
]
train = pd.concat(train_parts)

In [None]:
# Stack the trailing (test) slice of every per-class sample into one frame,
# in the order left, center, right.
test_parts = [
    split_df_in_train_test(class_df)[1]
    for class_df in (left_FE, center_FE, right_FE)
]
test = pd.concat(test_parts)

In [None]:
# Model input is the text column, the target is the label column.
text_column = "NeuGenerierterText_str"
target_column = "label"

X_train = train[text_column]
y_train = train[target_column]
X_test = test[text_column]
y_test = test[target_column]

## Create German stop words

In [None]:
!pip install stop-words

Collecting stop-words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-cp36-none-any.whl size=32919 sha256=78d3491bf8077b0c8c0f7a6078e2842acd5bce4322b5eb62ba89faee194153e0
  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [None]:
# Imported here rather than in the top import cell -- presumably because the
# package is only available after the pip install cell above has run.
from stop_words import get_stop_words

# NOTE(review): an earlier variant called get_stop_words('de'); presumably an equivalent language code -- confirm.
# German stop-word list, passed to the TF-IDF vectorizers further down.
stop_words = get_stop_words('german')

## Generate TFIDF vector

### Bigram:

In [None]:
# TF-IDF over word 1-grams and 2-grams, German stop words removed,
# capped at 30000 features.
bigram_vec = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 2),
    max_features=30000,
)

# Cast every document to a string first so non-string cells cannot break the vectorizer.
train_docs_bi = X_train.apply(lambda doc: np.str_(doc))
test_docs_bi = X_test.apply(lambda doc: np.str_(doc))

# The vocabulary is learned on the train texts only and reused for the test texts.
X_train_bi = bigram_vec.fit_transform(train_docs_bi)
X_test_bi = bigram_vec.transform(test_docs_bi)

### Trigram

In [None]:
# TF-IDF over word 1-grams up to 3-grams, German stop words removed,
# capped at 30000 features.
trigram_vec = TfidfVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 3),
    max_features=30000,
)

# Cast every document to a string first so non-string cells cannot break the vectorizer.
train_docs_tri = X_train.apply(lambda doc: np.str_(doc))
test_docs_tri = X_test.apply(lambda doc: np.str_(doc))

# The vocabulary is learned on the train texts only and reused for the test texts.
X_train_tri = trigram_vec.fit_transform(train_docs_tri)
X_test_tri = trigram_vec.transform(test_docs_tri)

## Generate LabelEncoder

In [None]:
label_enc = LabelEncoder()
# Learn the label -> integer mapping from the training labels only.
y_train_enc = label_enc.fit_transform(y_train)
# Reuse that mapping for the test labels. Re-fitting here (as before) would
# silently build a different mapping whenever the test labels differ from the
# training labels; transform() raises on unseen labels instead.
y_test_enc = label_enc.transform(y_test)

In [None]:
# Show which class name each encoded id (0, 1, 2) stands for.
label_enc.inverse_transform([0, 1, 2])

array(['central', 'links', 'rechts'], dtype=object)

In [None]:
# Derive ids and display names from the fitted encoder instead of hard-coding
# them, so both always match the mapping actually used for y_train_enc / y_test_enc.
# LabelEncoder assigns id i to classes_[i].
target_label = label_enc.classes_.tolist()
label = list(range(len(target_label)))

## Apply classifiers

In [None]:
def run_classifier(clf, X_train, X_test, y_train, y_test, label, target_label):
    """Fit ``clf``, predict on the test split and print the evaluation.

    Prints, in this order: the classifier repr, the accuracy, the confusion
    matrix and the per-class classification report.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted
            in place, so the object passed in is trained afterwards.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Encoded training labels.
        y_test: Encoded test labels.
        label: Class ids; fixes the row/column order of the confusion matrix
            and the row order of the report.
        target_label: Display names, in the same order as ``label``.

    Returns:
        None. All results are printed.
    """
    print("Training of the classifier: {} \n".format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n")

    print("Accuracy of the classifier:     ")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    print("\n")

    print("Confusion Matrix of the classifier: \n")
    con_mat = confusion_matrix(y_test, y_pred, labels=label)
    print(con_mat)

    print("\n")

    print("Classification Report of the classifier: \n")
    # Pass the same label ids as for the confusion matrix. Without them the
    # report infers the classes from the data, and target_names can be assigned
    # to the wrong class (or the call fails) when a class is missing.
    report = classification_report(
        y_test, y_pred, labels=label, target_names=target_label
    )
    print(report)

In [None]:
# One instance per model family. The same objects are passed to run_classifier
# for both feature sets, so each one holds its most recent fit.
# Every estimator with a random component is seeded so re-runs reproduce the scores.
dt = DecisionTreeClassifier(random_state=42)
# LinearSVC shuffles the data in its default dual solver -> seed it.
svc = LinearSVC(random_state=42)
# The saga solver shuffles the data as well -> seed it.
# NOTE(review): the default max_iter may be too low for saga to converge here -- check for ConvergenceWarning.
lr = LogisticRegression(multi_class="multinomial", solver="saga", random_state=42)
nb = BernoulliNB()

### ...using bigrams

#### Decision Tree

In [None]:
# Decision tree on the 1-2-gram TF-IDF features; fits `dt` in place and prints the metrics.
run_classifier(dt, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7533333333333333


Confusion Matrix of the classifier: 

[[762 222 216]
 [133 989  78]
 [175  64 961]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.71      0.64      0.67      1200
       links       0.78      0.82      0.80      1200
      rechts       0.77      0.80      0.78      1200

    accuracy                           0.75      3600
   macro avg       0.75      0.75      0.75      3600
weighted avg       0.75      0.75      0.75      3600

#### Naive Bayes

In [None]:
# Bernoulli naive Bayes on the 1-2-gram TF-IDF features; fits `nb` in place and prints the metrics.
run_classifier(nb, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.7175


Confusion Matrix of the classifier: 

[[905 150 145]
 [264 857  79]
 [274 105 821]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.63      0.75      0.68      1200
       links       0.77      0.71      0.74      1200
      rechts       0.79      0.68      0.73      1200

    accuracy                           0.72      3600
   macro avg       0.73      0.72      0.72      3600
weighted avg       0.73      0.72      0.72      3600



#### Support Vector Machine

In [None]:
# Linear SVM on the 1-2-gram TF-IDF features; fits `svc` in place and prints the metrics.
run_classifier(svc, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.8366666666666667


Confusion Matrix of the classifier: 

[[ 915  135  150]
 [ 104 1053   43]
 [ 113   43 1044]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.81      0.76      0.78      1200
       links       0.86      0.88      0.87      1200
      rechts       0.84      0.87      0.86      1200

    accuracy                           0.84      3600
   macro avg       0.84      0.84      0.84      3600
weighted avg       0.84      0.84      0.84      3600



#### Logistic Regression

In [None]:
# Multinomial logistic regression on the 1-2-gram TF-IDF features; fits `lr` in place and prints the metrics.
run_classifier(lr, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.8013888888888889


Confusion Matrix of the classifier: 

[[ 890  145  165]
 [ 138 1004   58]
 [ 146   63  991]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.76      0.74      0.75      1200
       links       0.83      0.84      0.83      1200
      rechts       0.82      0.83      0.82      1200

    accuracy                           0.80      3600
   macro avg       0.80      0.80      0.80      3600
weighted avg       0.80      0.80      0.80      3600



### ...using trigrams

#### Decision Tree

In [None]:
# Re-fits the same `dt` object, now on the 1-3-gram TF-IDF features (this replaces its bigram fit).
run_classifier(dt, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7555555555555555


Confusion Matrix of the classifier: 

[[781 212 207]
 [141 985  74]
 [170  76 954]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.72      0.65      0.68      1200
       links       0.77      0.82      0.80      1200
      rechts       0.77      0.80      0.78      1200

    accuracy                           0.76      3600
   macro avg       0.75      0.76      0.75      3600
weighted avg       0.75      0.76      0.75      3600

#### Naive Bayes

In [None]:
# Re-fits the same `nb` object, now on the 1-3-gram TF-IDF features (this replaces its bigram fit).
run_classifier(nb, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.6975


Confusion Matrix of the classifier: 

[[1023   31  146]
 [ 440  656  104]
 [ 330   38  832]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.57      0.85      0.68      1200
       links       0.90      0.55      0.68      1200
      rechts       0.77      0.69      0.73      1200

    accuracy                           0.70      3600
   macro avg       0.75      0.70      0.70      3600
weighted avg       0.75      0.70      0.70      3600



#### Support Vector Machine

In [None]:
# Re-fits the same `svc` object, now on the 1-3-gram TF-IDF features (this replaces its bigram fit).
run_classifier(svc, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.8363888888888888


Confusion Matrix of the classifier: 

[[ 915  134  151]
 [ 105 1051   44]
 [ 113   42 1045]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.81      0.76      0.78      1200
       links       0.86      0.88      0.87      1200
      rechts       0.84      0.87      0.86      1200

    accuracy                           0.84      3600
   macro avg       0.84      0.84      0.84      3600
weighted avg       0.84      0.84      0.84      3600



#### Logistic Regression

In [None]:
# Re-fits the same `lr` object, now on the 1-3-gram TF-IDF features (this replaces its bigram fit).
run_classifier(lr, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.8030555555555555


Confusion Matrix of the classifier: 

[[ 892  144  164]
 [ 137 1007   56]
 [ 147   61  992]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.76      0.74      0.75      1200
       links       0.83      0.84      0.83      1200
      rechts       0.82      0.83      0.82      1200

    accuracy                           0.80      3600
   macro avg       0.80      0.80      0.80      3600
weighted avg       0.80      0.80      0.80      3600

