# 0010 Correlation coefficients

In [1]:
import pandas as pd

# Lift the row limit on displayed frames/series so long outputs are shown in full.
pd.options.display.max_rows = None

In [2]:
# Load the pickled object that is later passed first to pd.concat.
# NOTE(review): unpickling can execute arbitrary code -- only load files from a trusted source.
features = pd.read_pickle(filepath_or_buffer='0000_features_20201124.pkl')

In [3]:
# Load the nine additional pickled feature tables. The files are read in the
# listed order and each result is bound to the name at the same position.
(
    sentence_beginnings,
    stop_words,
    sentiment,
    unique_sentences,
    cohesive_devices,
    passive_voice,
    grammar_mistakes,
    writer_invariant,
    uppercase,
) = [
    pd.read_pickle(pickle_path)
    for pickle_path in (
        '0001_sentence_beginnings_20220130.pkl',
        '0002_stop_words_20220130.pkl',
        '0003_sentiment_20220130.pkl',
        '0004_unique_sentences_20220130.pkl',
        '0005_cohesive_devices_20220130.pkl',
        '0006_passive_voice_20220130.pkl',
        '0007_grammar_mistakes_20220130.pkl',
        '0008_writer_invariant_20220204.pkl',
        '0009_uppercase_20220222.pkl',
    )
]

In [4]:
# Put all loaded tables side by side: concatenation is column-wise, so the
# result keeps every column of every input, in the order listed here.
df = pd.concat(
    objs=[
        features,
        sentence_beginnings,
        stop_words,
        sentiment,
        unique_sentences,
        cohesive_devices,
        passive_voice,
        grammar_mistakes,
        writer_invariant,
        uppercase,
    ],
    axis='columns',
)

In [5]:
# Persist the combined frame to disk.
# NOTE(review): the original cell carried the bare annotation "465"; its meaning is not evident here -- confirm.
df.to_pickle(path='features_20220222.pkl')  # 465

In [6]:
# Display the column labels of the combined frame `df`.
df.columns

Index(['text', 'language', 'views', 'conversions', 'cvr', 'label', 'ari_score',
       'ari_class', 'bormuth_score', 'bormuth_class', 'coleman_liau_score',
       'coleman_liau_class', 'new_dale_chall_score', 'new_dale_chall_class',
       'flesch_score', 'flesch_class', 'flesch_kincaid_score',
       'flesch_kincaid_class', 'fog_score', 'fog_class', 'lix_score',
       'lix_class', 'rix_score', 'rix_class', 'smog_score', 'smog_class',
       'strain_score', 'strain_class', 'readability_score', 'asl_flesch',
       'asw_flesch', 'asl_fog', 'ppw_fog', 'pew', 'pdw', 'pmw', 'ppw', 'psw',
       'puw', 'acs', 'ass', 'aws', 'acw', 'asw', 'sentences', 'words',
       'characters', 'syllables', 'rkmb1', 'rkmb2', 'psw2', 'sentiment',
       'sentiment_discrete', 'pus', 'acds', 'cdw', 'apvs', 'pvw',
       'passive_voice', 'grammar_mistakes', 'agms', 'gmw', 'ans', 'pn', 'avs',
       'pv', 'aas', 'pa', 'apcps', 'ppcp', 'neg', 'negations', 'aus', 'pu'],
      dtype='object')

* pmw -- Percentage of Multicharacter Words
* psw -- Percentage of Selling Words


* rkmb1 -- RKMB ver. 1. (**new**)
* rkmb2 -- RKMB ver. 2. (**new**)


* psw2 -- Percentage of Stop Words (**new**)


* sentiment -- sentiment (**new**)
* sentiment_discrete -- sentiment as discrete value (**new**)


* pus -- Percentage of Unique Sentences (**new**)


* acds -- Average number of Cohesive Devices (transition words and phrases) per Sentence (**new**)
* cdw -- Number of Cohesive Devices (transition words and phrases) per total number of Words in text (**new**)


* apvs -- Average number of Passive Voice constructions per Sentence (**new**)
* pvw -- Number of Passive Voice constructions per total number of Words in text (**new**)
* passive_voice -- passive voice (**new**)


* grammar_mistakes -- number of grammar mistakes (**new**)
* agms -- Average number of Grammar Mistakes per Sentence (**new**)
* gmw -- Number of Grammar Mistakes per total number of Words in text (**new**)


* ans -- Average number of Nouns per Sentence (**new**)
* pn -- Percentage of Nouns in text (**new**)
* avs -- Average number of Verbs per Sentence (**new**)
* pv -- Percentage of Verbs in text (**new**)
* aas -- Average number of Adjectives per Sentence (**new**)
* pa -- Percentage of Adjectives in text (**new**)
* apcps -- Average number of Prepositions, Conjunctions, and Particles per Sentence (**new**)
* ppcp -- Percentage of Prepositions, Conjunctions, and Particles in text (**new**)
* neg -- Average number of Negations per Sentence (**new**)
* negations -- whether the text contains any negation(s), yes/no (**new**)

* aus -- Average number of Uppercase words per Sentence (**new**)
* pu -- Percentage of Uppercase words in text (**new**)

Interpretation of the absolute value of the correlation coefficient, `|r|`:  
`0.0 ≤ |r| < 0.1` no correlation  
`0.1 ≤ |r| < 0.3` low  
`0.3 ≤ |r| < 0.5` medium  
`0.5 ≤ |r| ≤ 1.0` high

In [7]:
# Rank the features by the strength (absolute value) of their correlation with `cvr`,
# strongest first.
# Only numeric/boolean columns are handed to corr(): the frame appears to contain
# non-numeric columns as well (e.g. 'text', 'language'), and since pandas 2.0
# DataFrame.corr no longer drops such columns silently (numeric_only defaults to
# False) but raises instead. Selecting the dtypes explicitly works on old and new pandas.
(
    df.select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')['cvr']  # pearson | spearman | kendall
    .abs()
    .sort_values(ascending=False)
)

cvr                     1.000000
puw                     0.340194
rkmb2                   0.287256
sentences               0.265740
conversions             0.253485
neg                     0.249363
grammar_mistakes        0.248867
avs                     0.238150
pv                      0.228344
passive_voice           0.220344
pus                     0.214225
pa                      0.198621
characters              0.195236
syllables               0.191222
ppcp                    0.185119
negations               0.183836
words                   0.181639
pdw                     0.176888
new_dale_chall_score    0.172655
apcps                   0.155503
views                   0.151336
acds                    0.145142
label                   0.144629
psw2                    0.142852
new_dale_chall_class    0.141998
cdw                     0.129778
pvw                     0.128712
asl_fog                 0.117052
aws                     0.117052
pu                      0.113765
gmw       

In [8]:
# List the signed correlation of every feature with `cvr`, most negative first.
# Only numeric/boolean columns are handed to corr(): the frame appears to contain
# non-numeric columns as well (e.g. 'text', 'language'), and since pandas 2.0
# DataFrame.corr no longer drops such columns silently (numeric_only defaults to
# False) but raises instead. Selecting the dtypes explicitly works on old and new pandas.
# The column is picked with ['cvr'] rather than attribute access so the lookup cannot
# be confused with a DataFrame attribute.
(
    df.select_dtypes(include=['number', 'bool'])
    .corr(method='pearson')['cvr']  # pearson | spearman | kendall
    .sort_values()
)

rkmb2                  -0.287256
sentences              -0.265740
grammar_mistakes       -0.248867
passive_voice          -0.220344
pa                     -0.198621
characters             -0.195236
syllables              -0.191222
words                  -0.181639
pdw                    -0.176888
new_dale_chall_score   -0.172655
views                  -0.151336
label                  -0.144629
new_dale_chall_class   -0.141998
pvw                    -0.128712
pu                     -0.113765
gmw                    -0.110997
apvs                   -0.109230
ppw                    -0.103137
asw_flesch             -0.100058
pew                    -0.094010
acw                    -0.085432
agms                   -0.083390
aus                    -0.081520
flesch_class           -0.076434
pmw                    -0.068851
aas                    -0.062816
pn                     -0.060295
asw                    -0.055716
bormuth_score          -0.043023
fog_class              -0.039185
ari_score 