In [5]:
import pandas as pd

import sklearn
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import classification_report, confusion_matrix

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

## Preparing data

In [2]:
DATA = 'data/a1989123.csv'
csv = pd.read_csv(DATA)
csv.head()

Unnamed: 0,_unit_id,_unit_state,_trusted_judgments,_last_judgment_at,what_is_the_reason_for_your_answer_,what_type_of_language_is_used_in_this_tweet,what_type_of_language_is_used_in_this_tweet:confidence,id,text,what_is_the_reason_for_your_answer__gold,what_type_of_language_is_used_in_this_tweet_gold
0,3342615631,finalized,3,9/28/2022 00:03:25,Ill-mannered expressions\nSwear words/shit\nSw...,uncivil,1.0,2041171,"That's enough of that shit,thanks itv I've dec...",,
1,3342615632,finalized,3,9/28/2022 23:47:34,Insults (also includes name-calling)\nInsults\...,uncivil,1.0,4006019,I know that Leave has won #Brexit but you shou...,,
2,3342615633,finalized,3,9/28/2022 04:48:38,There is no uncivil language.\nthere is no unc...,civil,1.0,6022678,Extraordinary claim from senior MP that Jeremy...,,
3,3342615634,finalized,3,9/27/2022 04:22:12,There is no uncivil language.\nThere is no unc...,civil,1.0,2821819,#Moscow #SaintPetersburg Controversial comment...,,
4,3342615635,finalized,3,9/28/2022 02:08:52,There is no uncivil language.\nthere is no unc...,civil,1.0,83442,@RachaelMaskell Labour must argue to stay in d...,,


In [6]:
df = pd.DataFrame({'category': csv.what_type_of_language_is_used_in_this_tweet, 'text': csv.text})

vectorizer = CountVectorizer(stop_words='english', max_features=5000)
X = vectorizer.fit_transform(df.text)

labelEncoder = preprocessing.LabelEncoder()
y = labelEncoder.fit_transform(df.category)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

ros = RandomUnderSampler(random_state=42)
X_train, y_train = ros.fit_resample(X_train, y_train)

## Setting up classifiers

In [7]:
#clf = LogisticRegression(random_state=0).fit(X_train, y_train)
#clf = GaussianNB().fit(X_train.toarray(), y_train)
clf = BaggingClassifier(random_state=0).fit(X_train, y_train)


0.9775596072931276

In [13]:
print(clf.score(X_train, y_train))

y_train_pred = clf.predict(X_train)
print(classification_report(y_train, y_train_pred))

y_test_pred = clf.predict(X_test)
print(classification_report(y_test, y_test_pred))

confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_test_pred).ravel()
(tn, fp, fn, tp)

0.9775596072931276
              precision    recall  f1-score   support

           0       0.96      0.99      0.98       713
           1       0.99      0.96      0.98       713

    accuracy                           0.98      1426
   macro avg       0.98      0.98      0.98      1426
weighted avg       0.98      0.98      0.98      1426

              precision    recall  f1-score   support

           0       0.94      0.78      0.85      2940
           1       0.25      0.58      0.35       360

    accuracy                           0.76      3300
   macro avg       0.59      0.68      0.60      3300
weighted avg       0.86      0.76      0.80      3300



(2306, 634, 151, 209)

## Additional data predictions

In [15]:
DATA2 = 'data/Appen_sample2.csv'
csv2 = pd.read_csv(DATA2)

In [16]:
csv2.head()

Unnamed: 0,ID,text
0,4414070,The first domino following the #Brexit result ...
1,6766689,Brexit-bored Brits back to bashing the bishop ...
2,5333145,#Cheaper# UK #Holiday But Aviation Regulation ...
3,818855,Financial Times - BoE: Brexit weighing on the ...
4,2103995,If we Brexit I had cunning plan to go to Scotl...


In [28]:
def predict(df):
    print(df.head())
    X = vectorizer.fit_transform(df['text'])
    y_pred = clf.predict(X)
    return y_pred

In [34]:
csv2['label'] = y_pred

In [36]:
csv2['label'] = csv2['label'].map(lambda x: 'uncivil' if x == 1 else 'civil')

In [39]:
csv2.to_csv('data/test_Appen_sample.csv')