**Pipeline Code for Politeness Rating Prediction**


The following code imports the libraries used for the prediction code

In [55]:
import sys
import math
import pandas as pd
import vaderSentiment
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
#note: depending on how you installed (e.g., using source code download versus pip install), you may need to import like this:
#from vaderSentiment import SentimentIntensityAnalyzer
import spacy
import requests
import json
import sys
import os
from itertools import zip_longest
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.datasets import make_classification
from scipy import sparse
import pickle
import numpy as np
from sklearn import linear_model
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import accuracy_score
import politeness.api_util
from politeness.api_util import get_scores_strategies_token_indices
import warnings
warnings.filterwarnings("ignore")

**Global Variables declarations:**

In [3]:
#global variables:
#used in logistic regression
L2_REGULARIZATION_STRENGTH = 0.9

#headers and parameters for perspective api call
headers = {
    'Content-Type': 'application/json',
}

params = (
    ('key', 'AIzaSyBaMPpybrBfyWF54hvkFK1QuEBPPKmQh8M'),
)


The code below describes a function for reading data:

In [4]:
def readData(file_name):
    """
    Reads from the data file and returns data frame
    :param file_name: reads the file name
    :return: return a data frame read from file
    """
    data = pd.read_csv(file_name)
    return data

**Feature Extraction**

The code below describes the function for encoding features:
The following features are included-
1. Sentiment Scores: pos, neg and neu
easiness to read scales:
2. Flesch reading,
3. Dale_chall reading,
4. Gunning_foc score,
5. Smog_index and
6. Text standard scores.
all these scores are included in the entire feature set
7. Perspective api scores (toxicity scores for the entire text)
8. Politeness score
9. Impolite-ness score
10. Politeness strategies
11. POS tags

For using any feature, please comment out any particular feature's comment string using '#' token.
These features formed creates a feature matrix to be used by the code for prediction in the following code.

In [23]:
def feature_encoder(dataobjects):
    """
    Features included in the code are:
    1. sentiment scores: pos, neg and neu
    easiness to read scales:
        2. flesch reading,
        3. dale_chall reading,
        4. gunning_foc score,
        5. smog_index and
        6. text standard scores.
        all these scores are included in the entire feature set
    7. perspective api scores (toxicity scores for the entire text)
    8. politeness score
    9. impolite-ness score
    10. politeness strategies
    11. POS tags

    :param dataobjects: reads the data objects (data frame) which incorporate the text
    :return: a feature encoded matrix of numeric entities for the entire data set
    """
    nlp = spacy.load('en_core_web_sm')
    feature_dict = {}
    feature_set = {}

    cnt=0
    for line in dataobjects:
        if cnt == 0:
            cnt=1
            continue
        feature_dict[cnt]={}
        text = line[2]
        #sentiment scores: scores with pos, neg and neutral scores:
        #feature_sentiment_comment_string = """
        analyzer = SentimentIntensityAnalyzer()
        vs = analyzer.polarity_scores(text)
        feature_dict[cnt]['pos']=vs['pos']
        feature_dict[cnt]['neg']=vs['neg']
        feature_dict[cnt]['neu']=vs['neu']
        feature_set['pos']=1
        feature_set['neg']=1
        feature_set['neu']=1
        #"""
        
        #easiness to read scores: flesch reading:
        feature_flesch_reading_comment_string = """
        sc = textstat.flesch_reading_ease(text)
        feature_dict[cnt]['easiness']=sc
        feature_set['easiness']=1
        #"""
        
        #easiness to read scores: dale chall reading:
        feature_dale_chall_comment_string = """
        sc = textstat.dale_chall_readability_score(text)
        feature_dict[cnt]['easiness_dale']=sc
        feature_set['easines_dale']=1
        #"""
        
        #easiness to read scores: gunning fog reading:
        feature_gunning_fog_comment_string = """
        sc = textstat.gunning_fog(text)
        feature_dict[cnt]['easiness_fog']=sc
        feature_set['easines_fog']=1
        #"""
        
        #easiness to read scores: smog index reading:
        feature_smog_index_comment_string = """
        sc = textstat.smog_index(text)
        feature_dict[cnt]['easiness_smog']=sc
        feature_set['easines_smog']=1
        #"""
        
        #easiness to read scores: text standard reading:
        feature_txt_standard_comment_string = """
        sc = textstat.text_standard(text, float_output=False)
        feature_dict[cnt]['easiness_standard']=sc
        feature_set['easines_standard']=1
        #"""

        #preprocessing text to make readable for perspective api scores:
        feature_perspective_api_string = """
        stry = str(text)
        sent = ''
        for a in stry:
            if a==' ' or (a<='Z' and a>='A') or (a<='z' and a>='a') or (a<='9' and a>='0') or a=='?' or a=='.':
                sent +=a

        #perspective api scores call:
        data = '{comment: {text:"'+sent+'"}, languages: ["en"], requestedAttributes: {TOXICITY:{}} }'
        response = requests.post('https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze', headers=headers, params=params, data=data)
        j = json.loads(response.text)
        feature_dict[cnt]['toxicity'] =0.0
        try:
            feature_dict[cnt]['toxicity'] = j['attributeScores']['TOXICITY']['summaryScore']['value']
        except:
            try:
                feature_dict[cnt]['toxicity'] = j['attributeScores']['TOXICITY']['summaryScore']['value']
            except:
                try:
                    feature_dict[cnt]['toxicity'] = j['attributeScores']['TOXICITY']['summaryScore']['value']
                except:
                    try:
                        feature_dict[cnt]['toxicity'] = j['attributeScores']['TOXICITY']['summaryScore']['value']
                    except:
                        feature_dict[cnt]['toxicity'] =0.0
        feature_dict[cnt]['toxicity'] =0.0
        feature_set['toxicity']=1
        #"""
        
        #politeness strategies and politeness scores features:
        feature_politeness_score_comment_string = """
        sc = get_scores_strategies_token_indices(text)
        feature_dict[cnt]['score_polite']=sc['score_polite']
        feature_dict[cnt]['score_impolite'] = sc['score_impolite']
        feature_set['score_polite']=1
        feature_set['score_impolite']=1
    
        #print(feature_dict[cnt]['score_polite'])
        for a in sc['strategies']:
            feature_dict[cnt][a]=1
            feature_set[a]=1
        #"""
        
        #POS tags in the text:
        feature_pos_comment_string = """
        doc = nlp(text)
        for token in doc:
            if (str(token.pos_) not in feature_set):
                feature_set[str(token.pos_)]=1

            if not (str(token.pos_) in feature_dict[cnt]):
                feature_dict[cnt][str(token.pos_)]=1
            else:
                feature_dict[cnt][str(token.pos_)]+=1
        #"""
        cnt+=1

    #creating a systematic feature matrix from feature set
    feature_matrix = []
    for i in range(1, cnt):
        feature_list = []
        for key in feature_set.keys():
            if key in feature_dict[i]:
                feature_list.append(feature_dict[i][key])
            else:
                feature_list.append(0.0)
        feature_matrix.append(feature_list)

    return feature_matrix

**Reading Data**:

The following code is used for reading any particular data file.
The data file used for reading and extracting features is accessed via the 
'df' dataframe object. This object makes an assumption for using 3rd column (2nd index) for getting text records.

Please change the name of the file with the correct path for using some other data objects.

In [39]:
#calling read data
df = readData('Project_Politeness/Batch_Binary_Scores.csv')
df_labels = readData('three_labels_data.csv')

#list of lists of data frame objects, dataobjects: list of list of training modules, labelobjects: list of list of test modules
dataobjects = df.values.tolist()
labelobjects = df_labels.values.tolist()


The code below calls the feature_encoder method for the training dataset as follows:

In [40]:
feature_train_matrix = feature_encoder(dataobjects)

**Classifier Creation:**

The following code uses different type of classifiers. Each of the classifiers is instantiated in 'clf' variable

Using **Random Forest Classifier** as a classifier:

In [27]:
clf = RandomForestClassifier(n_estimators=100, max_depth=2,random_state=0)

Using **Logistic Regression** as a classifier:

In [41]:
clf = linear_model.LogisticRegression(C=L2_REGULARIZATION_STRENGTH, penalty='l2', n_jobs=4)

Using **Gaussian Naive Bayes** as a classifier:

In [29]:
clf = GaussianNB(priors=None, var_smoothing=1e-09)

Using **Multinomial Naive Bayes** as a classifier:

In [30]:
clf = MultinomialNB()

Using **Support Vector Machines** as a classifier:

In [31]:
clf = SVC(kernel = 'linear', C = 1, probability = True)

**Creating Class Labels**

The code below is used for creating class labels with the feature matrix for training dataset

**English Dataset:**

In [42]:
#getting class labels and appending to feature matrix (english)
Y = []
cnt = 0
for line in dataobjects:
    if cnt==0:
        cnt=1
        continue
    feature_train_matrix[cnt-1].append(line[-2])
    cnt+=1

X = np.array(feature_train_matrix)
Xtrain = np.array(X[:,:-1])
cnt = 0
for line in dataobjects:
    if cnt==0:
        cnt=1
        continue
    Y.append(X[cnt-1][-1])
    cnt+=1

Fitting the classifier on the dataset: **English Dataset**

Creating test classes and predictions.


In [56]:
#fitting model on the dataset
clf.fit(Xtrain[:1000],Y[:1000])

#Creating feature matrix for test set
#feature_label_matrix = feature_encoder(labelobjects)
#Xtester = np.array(feature_label_matrix)
Xtest = Xtrain[1000:]

#Predicting class labels for dataset: english
YtestEn = clf.predict(Xtest)


The code below is used for creating class labels with the feature matrix for training dataset

**Chinese Dataset:**

In [57]:
#getting class labels and appending to feature matrix (Chinese)
Y = []
cnt = 0
for line in dataobjects:
    if cnt==0:
        cnt=1
        continue
    feature_train_matrix[cnt-1][-1] = line[-1]
    cnt+=1

X = np.array(feature_train_matrix)
Xtrain = np.array(X[:,:-1])
cnt = 0
for line in dataobjects:
    if cnt==0:
        cnt=1
        continue
    Y.append(X[cnt-1][-1])
    cnt+=1


Fitting the classifier on the dataset: **Chinese Dataset**

Creating test classes and predictions.

In [58]:
#fitting model on the dataset
clf.fit(Xtrain[:1000],Y[:1000])

#Creating feature matrix for test set
#feature_label_matrix = feature_encoder(labelobjects)
#Xtester = np.array(feature_label_matrix)
Xtest = Xtrain[1000:]

#Predicting class labels for dataset: chinese
YtestCh = log_reg.predict(Xtest[:])

Getting precision recall f_score and accuracy scores for the dataset: **English Dataset**

In [59]:
#Getting precision recall f_score and accuracy scores for the dataset
scores = precision_recall_fscore_support(Y[1000:], YtestEn, average='micro')
acc = accuracy_score(Y[1000:], YtestEn)
print('English\nprecision:',scores[0],' recall: ',scores[1],' f-score: ',scores[2],' accuracy: ',acc)

English
precision: 0.7449664429530202  recall:  0.7449664429530202  f-score:  0.7449664429530202  accuracy:  0.7449664429530202


Getting precision recall f_score and accuracy scores for the dataset: **Chinese Dataset**

In [61]:
#Getting precision recall f_score and accuracy scores for the dataset
scores = precision_recall_fscore_support(Y[1000:], YtestCh, average='micro')
acc = accuracy_score(Y[1000:], YtestCh)
print('Chinese\nprecision:',scores[0],' recall: ',scores[1],' f-score: ',scores[2],' accuracy: ',acc)


Chinese
precision: 0.7449664429530202  recall:  0.7449664429530202  f-score:  0.7449664429530202  accuracy:  0.7449664429530202


In [54]:
#To be used only with label data
#Comparison model for english and chinese results:
if len(YtestEn)!=len(YtestCh):
    print('Not same length')
for i in range(len(YtestEn)):
    if YtestEn[i]!=YtestCh[i]:
        print(1)
    else:
        print(0)

0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
