Let's look at the data.

In [58]:
# Optional one-time setup: uncomment if openpyxl is not installed.
# NOTE(review): presumably needed as the Excel engine for pd.read_excel below — confirm the input files are .xlsx.
# !pip install openpyxl

In [59]:
import pandas as pd
import config

In [60]:
# Load both splits from the Excel files configured in `config`.
train_df, test_df = (
    pd.read_excel(excel_path)
    for excel_path in (config.TRAIN_FILE_PATH, config.TEST_FILE_PATH)
)

In [61]:
# Per-column count of missing values in each split.
print("Null data: train", train_df.isnull().sum(), sep="\n")

print("Null data: test", test_df.isnull().sum(), sep="\n")

Null data: train
Sentence     0
Entity       0
Sentiment    0
dtype: int64
Null data: test
Sentence    0
Entity      0
dtype: int64


In [62]:
# How many training rows carry each sentiment label.
print("Label Distribution", train_df['Sentiment'].value_counts(), sep="\n")

Label Distribution
positive    4100
negative    1899
Name: Sentiment, dtype: int64


The label distribution is moderately imbalanced (roughly 68% positive vs. 32% negative), but not severely skewed.

In [63]:
# Character-length summary statistics for the text columns of the training split.
for text_column in ('Sentence', 'Entity'):
    char_lengths = train_df[text_column].str.len()
    print(char_lengths.describe())

count    5999.000000
mean      107.550925
std        64.307248
min        12.000000
25%        64.000000
50%        93.000000
75%       132.000000
max       769.000000
Name: Sentence, dtype: float64
count    5999.000000
mean        7.463577
std         3.300444
min         2.000000
25%         5.000000
50%         7.000000
75%         8.000000
max        31.000000
Name: Entity, dtype: float64


In [64]:
# Character-length summary statistics for the text columns of the test split.
for text_column in ('Sentence', 'Entity'):
    char_lengths = test_df[text_column].str.len()
    print(char_lengths.describe())

count    1290.000000
mean       93.162791
std        62.909689
min         6.000000
25%        50.000000
50%        79.000000
75%       120.000000
max       583.000000
Name: Sentence, dtype: float64
count    1290.000000
mean        7.648062
std         3.248944
min         2.000000
25%         5.000000
50%         7.000000
75%         9.000000
max        27.000000
Name: Entity, dtype: float64


In [65]:
# Whitespace-token count summary statistics for the text columns of the training split.
for text_column in ('Sentence', 'Entity'):
    token_counts = train_df[text_column].str.split().str.len()
    print(token_counts.describe())

count    5999.000000
mean       19.371729
std        12.355986
min         2.000000
25%        11.000000
50%        17.000000
75%        24.000000
max       143.000000
Name: Sentence, dtype: float64
count    5999.000000
mean        1.128188
std         0.356529
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: Entity, dtype: float64


In [66]:
# Whitespace-token count summary statistics for the text columns of the test split.
for text_column in ('Sentence', 'Entity'):
    token_counts = test_df[text_column].str.split().str.len()
    print(token_counts.describe())

count    1290.000000
mean       16.931008
std        11.770987
min         1.000000
25%         9.000000
50%        14.000000
75%        22.000000
max       107.000000
Name: Sentence, dtype: float64
count    1290.000000
mean        1.152713
std         0.380799
min         1.000000
25%         1.000000
50%         1.000000
75%         1.000000
max         3.000000
Name: Entity, dtype: float64


In [67]:
# For every training row, count how often the entity text occurs in its sentence
# (plain, case-sensitive substring count; 0 means the entity is not found verbatim).
train_df['entity_counters'] = [
    sentence.count(f"{entity}")
    for sentence, entity in zip(train_df['Sentence'], train_df['Entity'])
]
train_df['entity_counters'].value_counts()

1    5747
2     230
3      14
0       8
Name: entity_counters, dtype: int64

In [68]:
# For every test row, count how often the entity text occurs in its sentence
# (plain, case-sensitive substring count; 0 means the entity is not found verbatim).
test_df['entity_counters'] = [
    sentence.count(f"{entity}")
    for sentence, entity in zip(test_df['Sentence'], test_df['Entity'])
]
test_df['entity_counters'].value_counts()

1    1207
0      53
2      29
3       1
Name: entity_counters, dtype: int64

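A few entities do not appear verbatim in their sentence (count 0). A small amount of preprocessing will solve this. Lower-casing everything would also make them match, but it would throw away the information carried by the original casing of the sentences, so the mismatched training entities are corrected individually instead.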

In [69]:
# Manual corrections for the training rows whose entity was not found verbatim
# in the sentence. Keys are positional row numbers (as used by `.iloc`),
# values are the corrected entity text.
entity_fixes = {
    668: 'Companies',
    2346: 'Customer Services',
    2028: 'customer services',
    3206: 'Advisor',
    4555: 'Transaction',
    3315: 'Website',
    1527: 'pet-hate',
    2826: 'AXA Insurance',
}

# Look the column up by name instead of hard-coding position 1, so the fixes
# cannot silently land in another column if the column layout ever changes.
entity_col_pos = train_df.columns.get_loc('Entity')
for row_pos, fixed_entity in entity_fixes.items():
    train_df.iloc[row_pos, entity_col_pos] = fixed_entity

In [70]:
# Recompute the substring counts for the training split to check the effect of the manual fixes.
train_df['entity_counters'] = [
    sentence.count(f"{entity}")
    for sentence, entity in zip(train_df['Sentence'], train_df['Entity'])
]
train_df['entity_counters'].value_counts()

1    5755
2     230
3      14
Name: entity_counters, dtype: int64

In [71]:
# Keep only the rows whose entity occurs at least once in the sentence.
# `.copy()` makes each result an independent frame, so the column assignments
# in later cells do not raise SettingWithCopyWarning on a filtered slice.
train_df = train_df[train_df['entity_counters'] != 0].copy()
test_df = test_df[test_df['entity_counters'] != 0].copy()

In [72]:
# Sanity check: recompute the substring counts on the filtered training split.
train_df['entity_counters'] = [
    sentence.count(f"{entity}")
    for sentence, entity in zip(train_df['Sentence'], train_df['Entity'])
]
train_df['entity_counters'].value_counts()

1    5755
2     230
3      14
Name: entity_counters, dtype: int64

In [73]:
import re
from tqdm.notebook import tqdm

tqdm.pandas()

# Count the entity only where it stands as a whole token, i.e. it is not
# embedded in a longer word (so "phone contact" inside "telephone contact"
# does not count).
# Lookarounds are used instead of `\b`: `\b` only marks a boundary next to a
# word character, so an entity starting or ending with punctuation would be
# matched only when glued to a neighbouring word character - the opposite of
# a whole-token match. For entities that begin and end with a word character
# both forms give the same result.
train_df['entity_counters'] = train_df.progress_apply(
    lambda x: len(
        re.findall(r"(?<!\w)" + re.escape(x['Entity']) + r"(?!\w)", x['Sentence'])
    ),
    axis=1,
)
train_df['entity_counters'].value_counts()

  0%|          | 0/5999 [00:00<?, ?it/s]

1    5745
2     195
0      48
3      11
Name: entity_counters, dtype: int64

In [74]:
# Inspect the training rows where the entity was not found as a whole word.
train_df.loc[train_df['entity_counters'].eq(0)]

Unnamed: 0,Sentence,Entity,Sentiment,entity_counters
133,I did make a telephone contact and the person ...,phone contact,positive,0
161,I found the form much simpler to fill in than ...,web site,positive,0
265,The call handler was very helpful and explaine...,phone manner,positive,0
283,Car insurance purchased online website was eas...,insurance purchase,positive,0
307,quickvalue for moneybest on car insuranceeasy ...,car insurance,positive,0
421,telephone staff helpful but gave incorrect adv...,phone staff,positive,0
632,As a fall-back no telephone number is given so...,phone number,negative,0
955,Set out in simple terms what insurance covers ...,insurance cover,positive,0
1047,Anthony I believe was the name of the sales re...,sales rep,positive,0
1521,The email address is a complete waste of time ...,mail address,negative,0


In [75]:
tqdm.pandas()

# Count the entity only where it stands as a whole token, i.e. it is not
# embedded in a longer word.
# Lookarounds are used instead of `\b`: `\b` only marks a boundary next to a
# word character, so an entity starting or ending with punctuation would be
# matched only when glued to a neighbouring word character - the opposite of
# a whole-token match. For entities that begin and end with a word character
# both forms give the same result.
test_df['entity_counters'] = test_df.progress_apply(
    lambda x: len(
        re.findall(r"(?<!\w)" + re.escape(x['Entity']) + r"(?!\w)", x['Sentence'])
    ),
    axis=1,
)
test_df['entity_counters'].value_counts()

  0%|          | 0/1237 [00:00<?, ?it/s]

1    1189
2      25
0      23
Name: entity_counters, dtype: int64

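Some issues still remain: with whole-word matching, several entities only occur as part of a longer word (e.g. "phone contact" inside "telephone contact"). These rows will have to be cleaned manually; for now, the unmatched rows are dropped.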

In [76]:
# Discard the rows in both splits where the entity was not found as a whole word.
train_df = train_df.loc[train_df['entity_counters'].ne(0)]
test_df = test_df.loc[test_df['entity_counters'].ne(0)]

In [None]:
# Persist the cleaned splits to the processed-data paths from `config`.
# `to_csv` is called with its defaults, so the row index is written as well.
for cleaned_frame, output_path in (
    (train_df, config.PROCESSED_TRAIN_FILE_PATH),
    (test_df, config.PROCESSED_TEST_FILE_PATH),
):
    cleaned_frame.to_csv(output_path)