In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.utils import shuffle, resample

## Load data 

In [2]:
news = pd.read_csv("/home/jupyter-karan_singh/FinalDataset/Stepwords_Clean_FinalDataset_ForModel_V3.csv", index_col=0)

In [3]:
# Get the same train and test data
def split_df_in_train_test(df):
    df = df.reset_index()
    split_point = int(np.round(df.shape[0]) * 0.8)
    df_train = df.loc[:split_point-1,:]
    df_test = df.loc[split_point:,:]
    return df_train, df_test

In [4]:
n_samples = 6000

In [5]:
left_FE = resample(shuffle(news[(news["label"]=="links") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
leanLeft_FE = resample(shuffle(news[(news["label"]=="halbLinks") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
center_FE = resample(shuffle(news[(news["label"]=="central") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
leanRight_FE = resample(shuffle(news[(news["label"]=="halbRechts") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)
right_FE = resample(shuffle(news[(news["label"]=="rechts") & (news["Länge"]<512)], random_state=42), \
         random_state=42, n_samples=n_samples)

In [6]:
train = pd.concat([split_df_in_train_test(left_FE)[0], \
    split_df_in_train_test(leanLeft_FE)[0], \
    split_df_in_train_test(center_FE)[0], \
    split_df_in_train_test(leanRight_FE)[0], \
    split_df_in_train_test(right_FE)[0]])

In [7]:
test =  pd.concat([split_df_in_train_test(left_FE)[1], \
    split_df_in_train_test(leanLeft_FE)[1], \
    split_df_in_train_test(center_FE)[1], \
    split_df_in_train_test(leanRight_FE)[1], \
    split_df_in_train_test(right_FE)[1]])

In [8]:
X_train, y_train = train["NeuGenerierterText_str"], train["label"]
X_test, y_test = test["NeuGenerierterText_str"], test["label"]

# Creating German Stopwords

In [9]:
!pip install stop-words

Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.
To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.
Defaulting to user installation because normal site-packages is not writeable


In [10]:
from stop_words import get_stop_words

#stop_words = get_stop_words('de')
stop_words = get_stop_words('german')

## Generate TFIDF vector

### Bigram:

In [11]:
bigram_vec = TfidfVectorizer(stop_words=stop_words, max_features=30000, ngram_range=(1, 2))

X_train_bi = bigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_bi = bigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

### Trigram

In [12]:
trigram_vec = TfidfVectorizer(stop_words=stop_words, max_features=30000, ngram_range=(1, 3))

X_train_tri = trigram_vec.fit_transform(X_train.apply(lambda x: np.str_(x)))
X_test_tri = trigram_vec.transform(X_test.apply(lambda x: np.str_(x)))

## Generate LabelEncoder

In [13]:
label_enc = LabelEncoder()
y_train_enc = label_enc.fit_transform(y_train)
y_test_enc = label_enc.fit_transform(y_test)

In [14]:
label_enc.inverse_transform([0, 1, 2, 3, 4]) 

array(['central', 'halbLinks', 'halbRechts', 'links', 'rechts'],
      dtype=object)

In [15]:
label = [0, 1, 2, 3, 4]
target_label = ['central', 'halbLinks', 'halbRechts', 'links', 'rechts']

## Apply classifiers

In [16]:
def run_classifier(clf, X_train, X_test, y_train, y_test, label, target_label):
    
    print("Training of the classifier: {} \n".format(clf))
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    print("\n")

    print("Accuracy of the classifier:     ")
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy)

    print("\n")

    print("Confusion Matrix of the classifier: \n")
    con_mat = confusion_matrix(y_test, y_pred, labels=label)
    print(con_mat)

    print("\n")

    print("Classification Report of the classifier: \n")
    report = classification_report(y_test, y_pred, target_names=target_label)
    print(report)

In [17]:
dt = DecisionTreeClassifier(random_state=42)
svc = LinearSVC()
lr = LogisticRegression(multi_class="multinomial", solver="saga")
nb = BernoulliNB()

### ...using with bigrams

#### Decision Tree

In [18]:
run_classifier(dt, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7498333333333334


Confusion Matrix of the classifier: 

[[ 828   55  133  104   80]
 [  43 1005   58   44   50]
 [ 157   67  748  125  103]
 [  65   34   84  972   45]
 [  74   50   91   39  946]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.71      0.69      0.70      1200
   halbLinks       0.83      0.84      0.83      1200
  halbRechts       0.67      0.62      0.65      1200
       links       0.76      0.81      0.78      1200
      rechts    

#### Naive Bayes

In [19]:
run_classifier(nb, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.6188333333333333


Confusion Matrix of the classifier: 

[[668  29 374  94  35]
 [ 60 786 236  61  57]
 [210  57 834  38  61]
 [198  14 249 701  38]
 [ 84  79 252  61 724]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.55      0.56      0.55      1200
   halbLinks       0.81      0.66      0.73      1200
  halbRechts       0.43      0.69      0.53      1200
       links       0.73      0.58      0.65      1200
      rechts       0.79      0.60      0.68      1200

    accuracy                           0.62      6000
   macro avg       0.66      0.62      0.63      6000
weighted avg       0.66      0.62      0.63      6000



#### Support Vector Machine

In [20]:
run_classifier(svc, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.8021666666666667


Confusion Matrix of the classifier: 

[[ 895   38  123   86   58]
 [  38 1068   34   21   39]
 [ 148   63  819   90   80]
 [  60   14   65 1020   41]
 [  56   41   56   36 1011]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.75      0.75      0.75      1200
   halbLinks       0.87      0.89      0.88      1200
  halbRechts       0.75      0.68      0.71      1200
       links       0.81      0.85      0.83      1200
      rechts       0.82      0.84      0.83      1200

    accuracy                           0.80      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.80   

#### Logistic Regression

In [21]:
run_classifier(lr, X_train_bi, X_test_bi, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.7506666666666667


Confusion Matrix of the classifier: 

[[ 850   42  153   96   59]
 [  45 1014   53   30   58]
 [ 172   66  779   89   94]
 [ 108   21   83  936   52]
 [  91   58   64   62  925]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.67      0.71      0.69      1200
   halbLinks       0.84      0.84      0.84      1200
  halbRechts       0.69      0.65      0.67      1200
       links       0.77      0.78      0.78      1200
      rechts       0.78      0.77      0.77      1200

    accuracy                    

### ...using trigrams

#### Decision Tree

In [22]:
run_classifier(dt, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 



Accuracy of the classifier:     
0.7416666666666667


Confusion Matrix of the classifier: 

[[ 835   49  130  107   79]
 [  38 1010   51   38   63]
 [ 174   78  718  110  120]
 [  85   32   79  954   50]
 [  79   51   83   54  933]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.69      0.70      0.69      1200
   halbLinks       0.83      0.84      0.83      1200
  halbRechts       0.68      0.60      0.64      1200
       links       0.76      0.80      0.77      1200
      rechts    

#### Naive Bayes

In [26]:
run_classifier(nb, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True) 



Accuracy of the classifier:     
0.586


Confusion Matrix of the classifier: 

[[674  28 442  20  36]
 [ 65 783 276  14  62]
 [213  53 859  12  63]
 [226  13 449 462  50]
 [ 90  68 287  17 738]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.53      0.56      0.55      1200
   halbLinks       0.83      0.65      0.73      1200
  halbRechts       0.37      0.72      0.49      1200
       links       0.88      0.39      0.54      1200
      rechts       0.78      0.61      0.69      1200

    accuracy                           0.59      6000
   macro avg       0.68      0.59      0.60      6000
weighted avg       0.68      0.59      0.60      6000



#### Support Vector Machine

In [27]:
run_classifier(svc, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0) 



Accuracy of the classifier:     
0.8008333333333333


Confusion Matrix of the classifier: 

[[ 895   39  123   90   53]
 [  40 1064   33   22   41]
 [ 144   69  822   89   76]
 [  63   14   65 1018   40]
 [  59   41   59   35 1006]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.75      0.75      0.75      1200
   halbLinks       0.87      0.89      0.88      1200
  halbRechts       0.75      0.69      0.71      1200
       links       0.81      0.85      0.83      1200
      rechts       0.83      0.84      0.83      1200

    accuracy                           0.80      6000
   macro avg       0.80      0.80      0.80      6000
weighted avg       0.80   

#### Logistic Regression

In [28]:
run_classifier(lr, X_train_tri, X_test_tri, y_train_enc, y_test_enc, label, target_label)

Training of the classifier: LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='saga', tol=0.0001, verbose=0,
                   warm_start=False) 



Accuracy of the classifier:     
0.752


Confusion Matrix of the classifier: 

[[ 858   41  150   94   57]
 [  44 1015   55   28   58]
 [ 178   65  774   90   93]
 [ 107   21   82  935   55]
 [  90   59   60   61  930]]


Classification Report of the classifier: 

              precision    recall  f1-score   support

     central       0.67      0.71      0.69      1200
   halbLinks       0.85      0.85      0.85      1200
  halbRechts       0.69      0.65      0.67      1200
       links       0.77      0.78      0.78      1200
      rechts       0.78      0.78      0.78      1200

    accuracy                           0.75  