# First analysis

### Importing libraries and settings

In [None]:
!pip install Textstat

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textstat
from sklearn import preprocessing
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
import math

In [None]:
# Show every column and never truncate cell contents when displaying frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no limit". The old -1 sentinel was deprecated in pandas 1.0 and
# is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

### Reading competition training data

In [None]:
# Load the competition training set and keep a subset of its columns.
all_df = pd.read_csv('/kaggle/input/commonlitreadabilityprize/train.csv')
# .copy() gives df its own data, so the feature columns assigned to it later
# do not raise SettingWithCopyWarning.
df = all_df[['id', 'excerpt', 'target', 'standard_error']].copy()

In [None]:
# Print the (rows, columns) shape, then display the first rows of the raw frame.
print(all_df.shape, '\n')
all_df.head()

In [None]:
# Distribution of the target column, in 30 bins.
df['target'].hist(bins=30)

### Reading English frequency data I found

In [None]:
# Load the word-frequency table. The file is treated as a flat sequence of
# whitespace-separated tokens that alternate word, count, word, count, ...
# The encoding is given explicitly so decoding does not depend on the platform
# default (the word list is assumed to be UTF-8).
with open('/kaggle/input/en-word-frequency/en_50k.txt', 'r', encoding='utf-8') as f:
    data = f.read().split()

# Parsing happens after the file has been closed; the handle is not needed.
words = data[0::2]
freqs = data[1::2]

# Words missing from the table get frequency 0 instead of raising KeyError.
freq = defaultdict(int, {k: int(v) for k, v in zip(words, freqs)})

In [None]:
# Sample sentence used to try out the stopword filtering further down.
demo_str = "I am a nasty old pirate, get close to me lad and you'll know all the secrests across all seven seas"

In [None]:
# Scratch check: remove() deletes only the first element equal to its argument.
a = [1, 2, 3, 4, 2, 2, 2, 2]
a.remove(2)
a

In [None]:
# English stopwords from NLTK, used to filter words out of the excerpts.
stopwords_list = stopwords.words('english')

In [None]:
# Keep only the whitespace-separated tokens of the demo sentence that are not
# in the stopword list (case and punctuation are left as they are).
a = [word for word in demo_str.split() if word not in stopwords_list]
a

In [None]:
# Stopwords are excluded from both the sum and the word count, so very common
# function words do not dominate the score.
def rarity(excerpt, freq_table=None, stop_words=None):
    """Return the mean corpus frequency of the distinct content words.

    The lower the output is, the rarer the vocabulary of the excerpt is.

    Args:
        excerpt: Text to score.
        freq_table: Mapping from lowercase word to its corpus count. Defaults
            to the notebook-level ``freq`` table.
        stop_words: Iterable of words to ignore. Defaults to the
            notebook-level ``stopwords_list``.

    Returns:
        The average frequency, truncated to an int, of the distinct
        non-stopword words. Words missing from the table count as 0.
        Returns 0 when no content word is left (e.g. an empty excerpt).
    """
    if freq_table is None:
        freq_table = freq
    if stop_words is None:
        stop_words = stopwords_list
    stop_words = set(stop_words)  # constant-time membership tests

    # Lowercase and strip ? . , ! in a single pass.
    cleaned = excerpt.lower().translate(str.maketrans('', '', '?.,!'))
    content_words = {word for word in cleaned.split() if word not in stop_words}

    # Nothing to average: avoid dividing by zero.
    if not content_words:
        return 0

    # .get() keeps lookups of unknown words from inserting keys into a defaultdict.
    total = sum(freq_table.get(word, 0) for word in content_words)

    # Divide by the number of words actually summed, not by the raw token count.
    return int(total / len(content_words))

### This is the most complex excerpt in the train data:

In [None]:
# Row with the lowest target value, then its excerpt text.
df.sort_values('target').iloc[0]['excerpt']

In [None]:
# Rarity index of the excerpt with the lowest target value.
rarity(df.sort_values('target').iloc[0]['excerpt'])

### This is the easiest excerpt in the train data:

In [None]:
# Row with the highest target value, then its excerpt text.
df.sort_values('target').iloc[-1]['excerpt']

In [None]:
# Rarity index of the excerpt with the highest target value.
rarity(df.sort_values('target').iloc[-1]['excerpt'])

### Applying the rarity index to all the data points:

In [None]:
# Score every excerpt with the rarity index.
df['rarity'] = df['excerpt'].apply(rarity)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['rarity'].min(), df['rarity'].max()
df['rarity'] = (df['rarity'] - minimum) / (maximum - minimum)

In [None]:
# Distribution of the scaled rarity index, in 30 bins.
df['rarity'].hist(bins=30)

In [None]:
# Excerpt length in words (whitespace-separated tokens). len(value) alone
# counts characters, which does not match the per-word normalization this
# column is used for later in the notebook.
df['longitude'] = df['excerpt'].apply(lambda value: len(value.split()))

In [None]:
# Distribution of excerpt lengths, in 30 bins.
df['longitude'].hist(bins=30)
# We need to do something about the difference in length between excerpts.

In [None]:
# Count the punctuation marks ? . , ! in each excerpt.
df['punctuation_marks'] = df['excerpt'].apply(
    lambda value: sum(value.count(mark) for mark in '?.,!')
)

In [None]:
def num_unique_words(text):
    """Count the distinct words in *text*.

    The text is lowercased and the characters ? . , ! are removed before it
    is split on whitespace, so "Dog." and "dog" count as the same word.
    """
    normalized = text.lower()
    for mark in '?.,!':
        normalized = normalized.replace(mark, '')
    return len(set(normalized.split()))

# Number of distinct words in each excerpt.
df['num_diff_words'] = df['excerpt'].apply(num_unique_words)

In [None]:
# Flesch reading-ease score of each excerpt.
df['fre_test'] = df['excerpt'].apply(textstat.flesch_reading_ease)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['fre_test'].min(), df['fre_test'].max()
df['fre_test'] = (df['fre_test'] - minimum) / (maximum - minimum)

In [None]:
# Flesch-Kincaid grade of each excerpt.
df['fkg_test'] = df['excerpt'].apply(textstat.flesch_kincaid_grade)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['fkg_test'].min(), df['fkg_test'].max()
df['fkg_test'] = (df['fkg_test'] - minimum) / (maximum - minimum)

In [None]:
# Gunning fog index of each excerpt.
df['gf_test'] = df['excerpt'].apply(textstat.gunning_fog)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['gf_test'].min(), df['gf_test'].max()
df['gf_test'] = (df['gf_test'] - minimum) / (maximum - minimum)

In [None]:
# SMOG index of each excerpt.
df['s_test'] = df['excerpt'].apply(textstat.smog_index)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['s_test'].min(), df['s_test'].max()
df['s_test'] = (df['s_test'] - minimum) / (maximum - minimum)

In [None]:
# Dale-Chall readability score of each excerpt.
df['dcrs_test'] = df['excerpt'].apply(textstat.dale_chall_readability_score)

# Min-max scale the scores into the [0, 1] range.
minimum, maximum = df['dcrs_test'].min(), df['dcrs_test'].max()
df['dcrs_test'] = (df['dcrs_test'] - minimum) / (maximum - minimum)

In [None]:
# Distribution of the scaled Dale-Chall score, in 30 bins.
df['dcrs_test'].hist(bins=30)

In [None]:
# Normalize the count features by excerpt length (the 'longitude' column).
# NOTE: re-running this cell divides the columns again.
df['punctuation_marks'] = df['punctuation_marks'].div(df['longitude'])
df['num_diff_words'] = df['num_diff_words'].div(df['longitude'])

In [None]:
# Summary statistics of the frame's columns.
df.describe()

In [None]:
# Pairwise correlations between the numeric columns. The string columns
# ('id', 'excerpt') are dropped explicitly: pandas 2.x no longer skips them
# silently and df.corr() would raise. select_dtypes works on older pandas too.
df.select_dtypes(include='number').corr()

In [None]:
# Target against the rarity index.
df.plot.scatter(x='rarity', y='target')

In [None]:
# Target against the scaled Dale-Chall score.
df.plot.scatter(x='dcrs_test', y='target')

In [None]:
# Rarity index against the scaled Dale-Chall score, drawn in red.
df.plot.scatter(x='dcrs_test', y='rarity', color='red')

In [None]:
# Flesch reading ease against the punctuation feature, colored by target.
df.plot.scatter(x='punctuation_marks', y='fre_test', c='target', colormap='viridis')

In [None]:
# Flesch reading ease against the rarity index, colored by target.
df.plot.scatter(x='rarity', y='fre_test', c='target', colormap='viridis')

In [None]:
# Rarity index against the scaled Dale-Chall score, colored by target.
df.plot.scatter(x='dcrs_test', y='rarity', c='target', colormap='viridis')

In [None]:
from sklearn.svm import SVR

In [None]:
# Display the first row to check the columns built so far.
df.head(1)

In [None]:
# Regression target as a NumPy array.
Y = df['target'].to_numpy()

In [None]:
# Feature matrix: rarity index and scaled Flesch reading ease.
X = df[['rarity', 'fre_test']].to_numpy()
X

In [None]:
# Fit an SVR with default settings on the current feature matrix.
regr = SVR().fit(X, Y)

# Predictions on the same rows the model was fitted on, and their squared errors.
df['pred'] = pred = regr.predict(X)
df['error'] = (df['target'] - df['pred']).pow(2)

# NOTE: this RMSE is in-sample (training) error, not a held-out estimate.
mse = df['error'].mean()
rmse = math.sqrt(mse)
rmse

In [None]:
# Feature matrix: rarity index and scaled Dale-Chall score.
X = df[['rarity', 'dcrs_test']].to_numpy()
X

In [None]:
# Fit an SVR with default settings on the current feature matrix.
regr = SVR().fit(X, Y)

# Predictions on the same rows the model was fitted on, and their squared errors.
df['pred'] = pred = regr.predict(X)
df['error'] = (df['target'] - df['pred']).pow(2)

# NOTE: this RMSE is in-sample (training) error, not a held-out estimate.
mse = df['error'].mean()
rmse = math.sqrt(mse)
rmse

In [None]:
# Degree-5 polynomial kernel with C=2, to see what happens.
regr = SVR(C=2, kernel='poly', degree=5).fit(X, Y)

# Predictions on the same rows the model was fitted on, and their squared errors.
df['pred'] = pred = regr.predict(X)
df['error'] = (df['target'] - df['pred']).pow(2)

# NOTE: this RMSE is in-sample (training) error, not a held-out estimate.
mse = df['error'].mean()
rmse = math.sqrt(mse)
rmse

In [None]:
# Synonyms from WordNet could be used to augment the dataset:
from itertools import chain
from nltk.corpus import wordnet

# Collect the distinct lemma names across every synset of the word.
synonyms = wordnet.synsets('creature')
lemmas = set(chain.from_iterable(synset.lemma_names() for synset in synonyms))
lemmas