In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle, resample

## Load data 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
news = pd.read_csv("/content/drive/MyDrive/Masterarbeit/Crawler/data/FinalDataset/Stepwords_Clean_FinalDataset_ForModel_V5.csv", index_col=0)

In [None]:
# Get the same train and test data
def split_df_in_train_test(df):
    df = df.reset_index()
    split_point = int(np.round(df.shape[0]) * 0.8)
    df_train = df.loc[:split_point-1,:]
    df_test = df.loc[split_point:,:]
    return df_train, df_test

In [None]:
n_samples = 6000

In [None]:
left_FE = resample(shuffle(news[(news["label"]=="links") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
leanLeft_FE = resample(shuffle(news[(news["label"]=="halbLinks") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
center_FE = resample(shuffle(news[(news["label"]=="central") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
leanRight_FE = resample(shuffle(news[(news["label"]=="halbRechts") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
right_FE = resample(shuffle(news[(news["label"]=="rechts") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)

In [None]:
train = pd.concat([split_df_in_train_test(left_FE)[0], \
    split_df_in_train_test(leanLeft_FE)[0], \
    split_df_in_train_test(center_FE)[0], \
    split_df_in_train_test(leanRight_FE)[0], \
    split_df_in_train_test(right_FE)[0]])

In [None]:
test =  pd.concat([split_df_in_train_test(left_FE)[1], \
    split_df_in_train_test(leanLeft_FE)[1], \
    split_df_in_train_test(center_FE)[1], \
    split_df_in_train_test(leanRight_FE)[1], \
    split_df_in_train_test(right_FE)[1]])

In [None]:
X_train, y_train = train["NeuGenerierterText_str"], train["label"]
X_test, y_test = test["NeuGenerierterText_str"], test["label"]

# Creating German Stopwords

In [None]:
!pip install stop-words

Collecting stop-words
  Downloading https://files.pythonhosted.org/packages/1c/cb/d58290804b7a4c5daa42abbbe2a93c477ae53e45541b1825e86f0dfaaf63/stop-words-2018.7.23.tar.gz
Building wheels for collected packages: stop-words
  Building wheel for stop-words (setup.py) ... [?25l[?25hdone
  Created wheel for stop-words: filename=stop_words-2018.7.23-cp36-none-any.whl size=32919 sha256=49778d83c91eeea11cb089e063b978889122e302af27246e285d263da910c24f
  Stored in directory: /root/.cache/pip/wheels/75/37/6a/2b295e03bd07290f0da95c3adb9a74ba95fbc333aa8b0c7c78
Successfully built stop-words
Installing collected packages: stop-words
Successfully installed stop-words-2018.7.23


In [None]:
from stop_words import get_stop_words

#stop_words = get_stop_words('de')
stop_words = get_stop_words('german')

## Generate TFIDF vector

### Bigram:

In [None]:
bigram_vec = TfidfVectorizer(stop_words=stop_words, max_features=30000, ngram_range=(1, 2))

X_train_bi = bigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_bi = bigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

### Trigram

In [None]:
trigram_vec = TfidfVectorizer(stop_words=stop_words, max_features=30000, ngram_range=(1, 3))

X_train_tri = trigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_tri = trigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

## Generate LabelEncoder

In [None]:
label_enc = LabelEncoder()
y_train_enc = label_enc.fit_transform(y_train)
y_test_enc = label_enc.fit_transform(y_test)

In [None]:
label_enc.inverse_transform([0, 1, 2, 3, 4]) 

array(['central', 'halbLinks', 'halbRechts', 'links', 'rechts'],
      dtype=object)

In [None]:
label = [0, 1, 2, 3, 4]
target_label = ['central', 'halbLinks', 'halbRechts', 'links', 'rechts']

## Apply classifiers

In [None]:
def run_classifier(clf, X_train, X_test, y_train, y_test, label, target_label):
    
    print("Training of the classifier: {} \n".format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n")

    print("Accuracy of the classifier:     ")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    print("\n")

    print("Confusion Matrix of the classifier: \n")
    con_mat = confusion_matrix(y_test, y_pred, labels=label)
    print(con_mat)

    print("\n")

    print("Classification Report of the classifier: \n")
    report = classification_report(y_test, y_pred, target_names=target_label)
    print(report)

In [None]:
dt = DecisionTreeClassifier(random_state=42)
svc = LinearSVC()
lr = LogisticRegression(multi_class="multinomial", solver="saga")
nb = BernoulliNB()

### ...using with bigrams

#### Decision Tree

In [None]:
run_classifier(dt, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7453333333333333


Confusion Matrix of the classifier: 

[[ 817   36  149   92  106]
 [  41 1020   62   33   44]
 [ 168   69  744  112  107]
 [  93   37   74  946   50]
 [  63   54   89   49  945]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.69      0.68      0.69      1200
   halbLinks       0.84      0.85      0.84      1200
  halbRechts       0.67      0.62      0.64      1200
       links       0.77      0.79      0.78     

#### Naive Bayes

In [None]:
run_classifier(nb, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.6198333333333333


Confusion Matrix of the classifier: 

[[676  22 347 116  39]
 [ 81 780 208  63  68]
 [215  55 825  43  62]
 [193  14 241 705  47]
 [ 80  82 245  60 733]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.54      0.56      0.55      1200
   halbLinks       0.82      0.65      0.72      1200
  halbRechts       0.44      0.69      0.54      1200
       links       0.71      0.59      0.64      1200
      rechts       0.77      0.61      0.68      1200

    accuracy                           0.62      6000
   macro avg       0.66      0.62      0.63      6000
weighted avg       0.66      0.62      0.63      6000



#### Support Vector Machine

In [None]:
run_classifier(svc, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.801


Confusion Matrix of the classifier: 

[[ 914   36  118   78   54]
 [  23 1055   42   25   55]
 [ 146   80  813   80   81]
 [  77   10   60 1014   39]
 [  52   38   65   35 1010]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.75      0.76      0.76      1200
   halbLinks       0.87      0.88      0.87      1200
  halbRechts       0.74      0.68      0.71      1200
       links       0.82      0.84      0.83      1200
      rechts       0.82      0.84      0.83      1200

    accuracy                           0.80      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.80      0.80      

#### Logistic Regression

In [None]:
run_classifier(lr, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.7496666666666667


Confusion Matrix of the classifier: 

[[884  38 110 117  51]
 [ 43 992  44  56  65]
 [177  80 759  88  96]
 [118  16  78 939  49]
 [ 87  61  69  59 924]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.68      0.74      0.70      1200
   halbLinks       0.84      0.83      0.83      1200
  halbRechts       0.72      0.63      0.67      1200
       links       0.75      0.78      0.76      1200
      rechts       0.78      0.77      0.77      1200

    accuracy                           0.75      6000
   

### ...using trigrams

#### Decision Tree

In [None]:
run_classifier(dt, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7433333333333333


Confusion Matrix of the classifier: 

[[ 822   46  146   99   87]
 [  36 1021   62   31   50]
 [ 173   79  725  104  119]
 [  70   34   78  960   58]
 [  70   54   82   62  932]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.70      0.69      0.69      1200
   halbLinks       0.83      0.85      0.84      1200
  halbRechts       0.66      0.60      0.63      1200
       links       0.76      0.80      0.78     

#### Naive Bayes

In [None]:
run_classifier(nb, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.5935


Confusion Matrix of the classifier: 

[[693  20 418  30  39]
 [ 84 793 236  19  68]
 [222  52 855   8  63]
 [221  12 440 470  57]
 [ 84  74 272  20 750]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.53      0.58      0.55      1200
   halbLinks       0.83      0.66      0.74      1200
  halbRechts       0.38      0.71      0.50      1200
       links       0.86      0.39      0.54      1200
      rechts       0.77      0.62      0.69      1200

    accuracy                           0.59      6000
   macro avg       0.68      0.59      0.60      6000
weighted avg       0.68      0.59      0.60      6000



#### Support Vector Machine

In [None]:
run_classifier(svc, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.8021666666666667


Confusion Matrix of the classifier: 

[[ 920   36  117   76   51]
 [  22 1058   41   26   53]
 [ 144   78  814   81   83]
 [  77   13   61 1010   39]
 [  53   37   66   33 1011]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.76      0.77      0.76      1200
   halbLinks       0.87      0.88      0.87      1200
  halbRechts       0.74      0.68      0.71      1200
       links       0.82      0.84      0.83      1200
      rechts       0.82      0.84      0.83      1200

    accuracy                           0.80      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.80   

#### Logistic Regression

In [None]:
run_classifier(lr, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.7508333333333334


Confusion Matrix of the classifier: 

[[886  37 109 117  51]
 [ 44 996  44  52  64]
 [179  76 760  91  94]
 [118  17  79 939  47]
 [ 88  59  69  60 924]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.67      0.74      0.70      1200
   halbLinks       0.84      0.83      0.84      1200
  halbRechts       0.72      0.63      0.67      1200
       links       0.75      0.78      0.76      1200
      rechts       0.78      0.77      0.78      1200

    accuracy                           0.75      6000
   