In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import sent_tokenize
import nltk
nltk.download('punkt')
import re
from IPython.core.display import display, HTML
import spacy
from spacy import displacy
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

In [2]:
# Load the competition training table. All later cells add columns to the copy `df`,
# so the frame read from disk stays available unchanged as `train`.
train = pd.read_csv('../input/feedback-prize-effectiveness/train.csv')
df = train.copy()

In [3]:
# Quick look at the first rows of the working frame.
df.head()

In [4]:
# # Split sentences
# text = [sent_tokenize(text=row,language = 'english') for row in df['discourse_text']]

# i = 0
# z=0 

# id_list = []
# text_list = []
# for id in df['discourse_id']:
#   for y in range(len(text[i])):
#     # print(text_dict)
#     id_list.append(id)
#     text_list.append(text[i][y])
#     z+=1
#   i+=1

#   text_dict={'discourse_id':id_list,'discourse_text':text_list}
#   sent_tokenized = pd.DataFrame(text_dict)
#   sent_tokenized.to_csv('/content/drive/My Drive/Colab Notebooks/Data/train_sentence_tokenized.csv')


## Discourse Type
Each essay element contains discourse type metadata. There are 7 `discourse_type` values, explained below:

`Lead` - an introduction that begins with a statistic, a quotation, a description, or some other device to grab the reader’s attention and point toward the thesis

`Position` - an opinion or conclusion on the main question

`Claim` - a claim that supports the position

`Counterclaim` - a claim that refutes another claim or gives an opposing reason to the position

`Rebuttal` - a claim that refutes a counterclaim

`Evidence` - ideas or examples that support claims, counterclaims, or rebuttals.

`Concluding Statement` - a concluding statement that restates the claims.

# Data Cleaning

## Normalize unicode

In [5]:
# https://www.kaggle.com/competitions/feedback-prize-2021/discussion/313330
def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead of failing.

    Returns the UTF-8 bytes of ``error.object[error.start:error.end]`` together with
    ``error.end``, the position at which the codec should continue.
    """
    resume_at = error.end
    offending = error.object[error.start : resume_at]
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 instead of failing.

    Returns the cp1252 decoding of ``error.object[error.start:error.end]`` together with
    ``error.end``, the position at which the codec should continue.
    """
    resume_at = error.end
    offending = error.object[error.start : resume_at]
    return offending.decode("cp1252"), resume_at

# Register the two handlers defined above with `codecs` under the exact names that
# `resolve_encodings_and_normalize` passes as `errors=` when encoding/decoding.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems and normalize the abnormal characters.

    The text is round-tripped through ``raw_unicode_escape`` -> ``utf-8`` -> ``cp1252``
    -> ``utf-8``; wherever a step cannot encode/decode a span, the custom error handlers
    registered under ``replace_decoding_with_cp1252`` / ``replace_encoding_with_utf8``
    supply the replacement. The repaired string is finally passed through ``unidecode``.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    return unidecode(repaired)


In [6]:
# Store the cleaned text in a new column; the raw `discourse_text` column is left as is.
df['discourse_text_norm'] = df['discourse_text'].apply(resolve_encodings_and_normalize)

In [7]:
# sample = df.sample(1)['discourse_id'].values[0]
# print(sample)
sample = '451b76cc4b59'
# Look the discourse up once and read both the raw and the normalized text from that row.
sample_row = df.loc[df['discourse_id'] == sample].iloc[0]
text = sample_row['discourse_text']
text_norm = sample_row['discourse_text_norm']
print(f'Discourse_id: {sample}')
print()
text
# print(text_norm)

In [8]:
# Normalized version of the sample discourse, for comparison with the raw text.
text_norm

# EDA

In [9]:
# Number of sentences per discourse (NLTK sentence tokenizer, English model).
# The counts are produced with `.apply` on the column itself, so the result is a Series
# that shares `df`'s index; building a separate DataFrame and assigning it relied on
# `df` having a fresh 0..n-1 index. This also no longer rebinds the notebook-level
# `text` variable (the sample discourse string) to a list of ints.
df['text_num_sentence'] = df['discourse_text'].apply(
    lambda row: len(sent_tokenize(text=row, language='english'))
)

In [10]:
# Including special characters
# Whitespace-separated token count and raw character count of each discourse.
df['text_word_count'] = df['discourse_text'].str.split().map(len)
df['text_char_count'] = df['discourse_text'].map(len)

In [11]:
def num_specialchar(row, count_whitespace=True):
    """Count the characters of `row` that are not regex word characters.

    Every run of word characters (letters, digits, underscore) is removed and the
    length of the remainder is returned.

    Parameters
    ----------
    row : str
        Text to analyse.
    count_whitespace : bool, default True
        With the default, spaces, tabs and newlines are part of the count (this is the
        original behaviour). Pass ``False`` to count punctuation/symbols only.

    Returns
    -------
    int
        Number of remaining characters.
    """
    # Raw string: a plain '\w' literal is an invalid escape sequence and triggers a
    # warning on recent Python versions.
    leftover = re.sub(r'\w+', '', row)
    if not count_whitespace:
        leftover = re.sub(r'\s+', '', leftover)
    return len(leftover)

# Per-discourse count of non-word characters, computed on the raw (un-normalized) text.
df['text_num_special_char']=df['discourse_text'].apply(num_specialchar)

In [12]:
# Summary statistics per effectiveness label. Only a cell's last expression is rendered
# automatically, so the table has to be shown explicitly with display().
display(df.groupby('discourse_effectiveness')['text_num_sentence'].describe())

# Overall distribution of sentence counts, clipped to 0-15 on the x axis.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='text_num_sentence', ax=ax)
ax.set_title('Number of sentence')
ax.set_xlim(0, 15)
plt.show()

In [13]:
# Word-count summary statistics per effectiveness label (shown as the cell output).
df.groupby('discourse_effectiveness')['text_word_count'].describe()
# plt.figure(figsize=(10,10))
# sns.histplot(data=df, x ='text_word_count')
# plt.title('Number of words')
# plt.xlim(0,400)

In [14]:
# Per-label subsets of the working frame.
# NOTE(review): `ineff_word` is built but not plotted in this cell -- confirm whether an
# Ineffective-only panel was intended.
ineff_word = df.loc[df['discourse_effectiveness']=='Ineffective']
eff_word = df.loc[df['discourse_effectiveness']=='Effective']
adq_word = df.loc[df['discourse_effectiveness']=='Adequate']

figs, axs = plt.subplots(3,1,figsize=(10,10))
# Figure-level title: plt.title() would only label the last (bottom) subplot.
figs.suptitle('Number of words')
# Top: all labels overlaid; middle: Effective only; bottom: Adequate only.
sns.histplot(data=df, x ='text_word_count',hue='discourse_effectiveness',ax=axs[0],palette = 'GnBu')
sns.histplot(data=eff_word, x ='text_word_count',hue='discourse_effectiveness',ax=axs[1])
sns.histplot(data=adq_word, x ='text_word_count',hue='discourse_effectiveness',ax=axs[2])
# Same x range on every panel so the distributions can be compared directly.
for panel in axs:
    panel.set_xlim(0, 250)
plt.show()

In [15]:
# Summary statistics per effectiveness label. Only a cell's last expression is rendered
# automatically, so the table has to be shown explicitly with display().
display(df.groupby('discourse_effectiveness')['text_char_count'].describe())

# Overall distribution of character counts, clipped to 0-3000 on the x axis.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='text_char_count', ax=ax)
ax.set_title('Number of characters')
ax.set_xlim(0, 3000)
plt.show()


In [16]:
# Summary statistics per effectiveness label. Only a cell's last expression is rendered
# automatically, so the table has to be shown explicitly with display().
display(df.groupby('discourse_effectiveness')['text_num_special_char'].describe())

# Overall distribution of non-word character counts, clipped to 0-600 on the x axis.
fig, ax = plt.subplots(figsize=(10, 10))
sns.histplot(data=df, x='text_num_special_char', ax=ax)
ax.set_title('Number of special characters')
ax.set_xlim(0, 600)
plt.show()

In [17]:
# Number of discourse elements per effectiveness label, as a bar chart.
effectiveness = df['discourse_effectiveness'].value_counts()
fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('discourse_effectiveness distribution')
sns.barplot(x=effectiveness.index, y=effectiveness.values, ax=ax);

In [18]:
# Number of discourse elements per discourse type, as a bar chart.
type_df = df['discourse_type'].value_counts()

fig, ax = plt.subplots(figsize=(15, 7))
ax.set_title('discourse_type distribution')
sns.barplot(x=type_df.index, y=type_df.values, palette='Pastel1', ax=ax);

In [19]:
# Number of distinct discourse types that occur in each essay.
df.groupby(['essay_id'])['discourse_type'].nunique()

In [20]:
# df['discourse_type'] = df['discourse_type'].map({'Lead':'1_Lead','Position':'2_Position','Claim':'3_Claim','Counterclaim':'4_Counterclaim','Rebuttal':'5_Rebuttal','Evidence':'6_Evidence','Concluding Statement':'7_Concluding_Statement'})

In [21]:
# df.groupby(['essay_id','discourse_type']).first().head(10)


In [22]:
def cat_analyser(data, col):
    """Show the distribution of a categorical column as a count plot and a pie chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains `col`.
    col : str
        Name of the categorical column to summarise.

    Draws a two-panel figure (count plot on the left, pie chart with percentages on the
    right, both ordered by frequency), shows it with ``plt.show()`` and returns None.
    """
    # Compute the frequency table once; both panels and the colour scale use it.
    counts = data[col].value_counts()

    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(18, 6))
    fig.suptitle(col, fontsize=16)

    sns.countplot(data=data,
                  x=col,
                  ax=ax[0],
                  palette='Pastel1',
                  order=counts.index)
    ax[0].set_xlabel('')

    # Min-max scale the counts to [0, 1] to pick one colormap colour per slice.
    # When all counts are equal (or there is a single category) the range is zero;
    # dividing by it would give NaN colours, so fall back to the first colour.
    pie_cmap = plt.get_cmap('Pastel1')
    spread = counts.max() - counts.min()
    if spread == 0:
        scaled = np.zeros(len(counts))
    else:
        scaled = ((counts - counts.min()) / spread).to_numpy()

    counts.plot.pie(autopct='%1.1f%%',
                    textprops={'fontsize': 12},
                    ax=ax[1],
                    colors=pie_cmap(scaled))
    ax[1].set_ylabel('')
    plt.show()
    
# One two-panel summary per categorical column of interest.
for col in ('discourse_type', 'discourse_effectiveness'):
    cat_analyser(df, col)

# Effectiveness breakdown within each discourse type.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(18, 8))
sns.countplot(data=df,
              x='discourse_type',
              hue='discourse_effectiveness',
              palette='Pastel2',
              ax=ax)
ax.legend(loc='best', prop={'size': 14})
ax.set_title('Discourse Type & Discourse Effectiveness', size=14)
plt.yticks(size=14)
plt.xticks(size=14)
plt.show()

In [23]:
from IPython.core.display import display, HTML

def show_examples_for_discourse_type(discourse_type, topic):
    """Render one example per effectiveness label for a given discourse type.

    The rows of ``df`` with this discourse type are shuffled with a fixed seed and the
    first Ineffective / Adequate / Effective text is shown side by side in an HTML table.

    Parameters
    ----------
    discourse_type : str
        Value of ``df['discourse_type']`` to illustrate.
    topic :
        Unused; kept so existing two-argument calls keep working.

    Returns None; the table is emitted with ``display(HTML(...))``.
    """
    # Local import: `html` is a stdlib module that the notebook does not import at the top.
    from html import escape

    # Boolean mask instead of an f-string query, so quotes in `discourse_type` cannot
    # break the filter expression. Row order (and thus the seeded shuffle) is unchanged.
    filt = df.loc[df['discourse_type'] == discourse_type].sample(frac=1, random_state=420)

    def _example_cell(label):
        """Return the HTML-escaped text of the first shuffled row with this label."""
        matches = filt.loc[filt['discourse_effectiveness'] == label, 'discourse_text']
        if matches.empty:
            # No row with this label for this discourse type: show a placeholder
            # instead of raising IndexError.
            return '<em>(no example)</em>'
        # Essay text is free-form student writing; escape it so characters such as
        # '<' or '&' are displayed literally instead of being parsed as markup.
        return escape(str(matches.iloc[0]))

    heading = escape(str(discourse_type))
    ineffective_cell = _example_cell('Ineffective')
    adequate_cell = _example_cell('Adequate')
    effective_cell = _example_cell('Effective')

    display(HTML(
        f"""
        <h4 style="background:#66ccff ;color: white; font-size: 20px; width:20%; padding: 12px 12px;" >{heading}</h4>
        <table>
            <tr>
              <th style="color:black; font-size: 15px" bgcolor="#4d79ff" width="33%">Ineffective</th>
              <th style="color:black; font-size: 15px" bgcolor="#4d79ff" width="33%">Adequate</th>
              <th style="color:black; font-size: 15px" bgcolor="#4d79ff" width="33%">Effective</th>
            </tr>
            <tr>
              <td>{ineffective_cell}</td>
              <td>{adequate_cell}</td>
              <td>{effective_cell}</td>
            </tr>
        </table>
        """
    ))


# Iterate in order of first appearance in `df`. A `set` of strings has no stable
# iteration order across kernel restarts, so the sections were reshuffled on every run.
for dt in df['discourse_type'].unique():
    show_examples_for_discourse_type(dt, 10)

In [24]:
# All discourse rows of one essay (the same essay id is rendered with displaCy below).
df.loc[df['essay_id']=='331CA007D0AD']

In [25]:
from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs
import re

def replace_encoding_with_utf8(error: UnicodeError) -> Tuple[bytes, int]:
    """Codec error handler: encode the offending slice as UTF-8 instead of failing.

    Returns the UTF-8 bytes of ``error.object[error.start:error.end]`` together with
    ``error.end``, the position at which the codec should continue.
    """
    resume_at = error.end
    offending = error.object[error.start : resume_at]
    return offending.encode("utf-8"), resume_at


def replace_decoding_with_cp1252(error: UnicodeError) -> Tuple[str, int]:
    """Codec error handler: decode the offending bytes as cp1252 instead of failing.

    Returns the cp1252 decoding of ``error.object[error.start:error.end]`` together with
    ``error.end``, the position at which the codec should continue.
    """
    resume_at = error.end
    offending = error.object[error.start : resume_at]
    return offending.decode("cp1252"), resume_at

# Register the two handlers defined in this cell with `codecs` under the exact names
# that `resolve_encodings_and_normalize` passes as `errors=`.
# NOTE(review): the same names are registered earlier in the notebook ("Normalize
# unicode" section); this call registers them again with the definitions from this cell.
codecs.register_error("replace_encoding_with_utf8", replace_encoding_with_utf8)
codecs.register_error("replace_decoding_with_cp1252", replace_decoding_with_cp1252)

def resolve_encodings_and_normalize(text: str) -> str:
    """Resolve the encoding problems, normalize the abnormal characters, drop newlines.

    The text is round-tripped through ``raw_unicode_escape`` -> ``utf-8`` -> ``cp1252``
    -> ``utf-8`` using the custom codec error handlers, passed through ``unidecode``,
    and finally every newline character is removed (nothing is inserted in its place).

    This redefines the function of the same name from the "Normalize unicode" section;
    that earlier version keeps newlines. Cells run after this one use this version.
    """
    escaped_bytes = text.encode("raw_unicode_escape")
    first_pass = escaped_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    cp1252_bytes = first_pass.encode("cp1252", errors="replace_encoding_with_utf8")
    repaired = cp1252_bytes.decode("utf-8", errors="replace_decoding_with_cp1252")
    ascii_text = unidecode(repaired)
    return re.sub(r"\n", "", ascii_text)

In [26]:
def get_text(ids):
    """Return the full text of the training essay file named ``<ids>.txt``."""
    essay_path = f'../input/feedback-prize-effectiveness/train/{ids}.txt'
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

def display_sample(essay_id):
    """Render all discourse elements of one essay as labelled spans with displaCy.

    The normalized discourse texts of `essay_id` are concatenated in `df` row order
    (no separator) and each one is marked as an entity labelled
    ``"<discourse_effectiveness> - <discourse_type>"``. The annotated text is rendered
    first, then the same text again without any annotations.

    Parameters
    ----------
    essay_id : str
        Value of ``df['essay_id']`` to render.

    Returns
    -------
    list
        The one-element list of dicts (``text``, ``ents``, ``title``) that was passed
        to ``displacy.render`` for the annotated version.
    """
    # Select the essay's rows once, using the `essay_id` argument. The loop length used
    # to come from a hard-coded essay id, which broke the function for any other essay.
    essay_rows = df.loc[
        df['essay_id'] == essay_id,
        ['discourse_text', 'discourse_type', 'discourse_effectiveness'],
    ]

    ents = []
    pieces = []
    char_pos = 0
    for discourse_text, discoursetype, label in essay_rows.itertuples(index=False, name=None):
        discourse_text = resolve_encodings_and_normalize(discourse_text)
        # Character offsets of this discourse inside the concatenated text.
        begin = char_pos
        end = begin + len(discourse_text)
        ents.append({"start": begin,
                     "end": end,
                     "label": label + ' - ' + discoursetype})
        pieces.append(discourse_text)
        char_pos = end

    text = ''.join(pieces)
    ex = [{"text": text, "ents": ents, 'title': f"Essay ID: {essay_id}"}]
    ex2 = [{"text": text, "ents": []}]

    displacy.render(ex, style="ent", manual=True, jupyter=True, options={"distance": 100})
    print()
    displacy.render(ex2, style="ent", manual=True, jupyter=True, options={"distance": 100})
    return ex

# Render one example essay; the displaCy input that was rendered is kept in `ex`.
ex = display_sample("331CA007D0AD")

In [27]:
def get_text(ids):
    """Return the full text of the training essay file named ``<ids>.txt``."""
    essay_path = f'../input/feedback-prize-effectiveness/train/{ids}.txt'
    with open(essay_path, 'r') as essay_file:
        return essay_file.read()

def display_sample(essay_id):
    """Render each discourse element of one essay as its own colour-coded displaCy block.

    Every discourse of `essay_id` is normalized and rendered separately as a single
    entity spanning the whole text, labelled
    ``"<discourse_effectiveness> - <discourse_type>"`` and coloured by effectiveness
    (Ineffective red, Adequate yellow, Effective green). Rows whose effectiveness is
    none of these three labels are not rendered.

    Parameters
    ----------
    essay_id : str
        Value of ``df['essay_id']`` to render.

    Returns
    -------
    list
        The displaCy input (one-element list of dicts) built for the last discourse of
        the essay, or an empty placeholder entry when the essay has no rows.
    """
    label_colors = {
        'Ineffective': "#FF5050",
        'Adequate': "#FFCC00",
        'Effective': "#33CC33",
    }

    # Select the essay's rows once, using the `essay_id` argument. The loop length used
    # to come from a hard-coded essay id, which broke the function for any other essay.
    essay_rows = df.loc[
        df['essay_id'] == essay_id,
        ['discourse_text', 'discourse_type', 'discourse_effectiveness'],
    ]

    ex = [{"text": '', "ents": []}]
    for discourse_text, discoursetype, label in essay_rows.itertuples(index=False, name=None):
        discourse_text = resolve_encodings_and_normalize(discourse_text)
        ent_label = label + ' - ' + discoursetype
        ex = [{"text": discourse_text,
               "ents": [{"start": 0, "end": len(discourse_text), "label": ent_label}]}]

        color = label_colors.get(label)
        if color is None:
            continue

        # The entity visualizer reads per-label colours from the "colors" option
        # (label -> colour); the previous "color" key is not an "ent" option, so the
        # intended colours were never applied. Both spellings of the label are passed
        # because the colour lookup is done on the upper-cased label.
        options = {"colors": {ent_label: color, ent_label.upper(): color}}
        displacy.render(ex, style="ent", manual=True, jupyter=True, options=options)

    return ex

# Render one example essay; the displaCy input returned by the call is kept in `ex`.
ex = display_sample("331CA007D0AD")