In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Messages_text.csv')
df.head()

Unnamed: 0,sender_id,message
0,-1001285729190,My Visa experience - 07/08/2021\nApproved✅\nDe...
1,-1001285729190,9th July 2021\nLocation: Hyderabad\nAppointmen...
2,-1001285729190,July 9th\nHyderabad Consulate\nIn time 10:25\n...
3,-1001285729190,9th July 2021\nChennai VAC(July 4) and VI\nApp...
4,-1001285729190,Visa experience - 8/7/21 \nApproved✅\nMumbai c...


## Exploring messages

* Its possible that we have empty cells and some jargon messages where the len of the message is < 145 characters.
* Note: 145 is not a magic number. After observing messages where the character size < 145, i concluded that the information available was irrelavent. Hence the messages with < 145 characters are ignored. 

In [6]:
# Check for Nan values in messages column.
df['message'].isnull().sum()
print(df.shape) # (2397, 2)
# As there are 40 null values, we can drop the rows as they are of no use

df.dropna(inplace=True)
print("Shape of dataframe after dropping nan rows")
print(df.shape)

(2397, 2)
Shape of dataframe after dropping nan rows
(2357, 2)


In [11]:

MESSAGES_LEN_TO_IGNORE = 145

df['length_of_message'] = df['message'].apply(lambda x : len(str(x)))
# Filter out of the rows with message length < 145
df_filter = df[df['length_of_message'] > MESSAGES_LEN_TO_IGNORE]


In [12]:
# The final dataframe after filtering out un-necessary messages 
print(df_filter.shape)

(2301, 3)


## Below attributes will be extracted from messages
* [Extracting Status](#extract_status)
* Date of the Visa interview
* [Extracting location](#extract_location)
* Questions asked in VI
* Duration
* [Extracting University Name](#extract_university)


### Extracting location
<a id='extract_location'></a>


In [28]:
def get_consulate_location(str_to_check):
    known_consulate_locations = ['hyderabad', 'mumbai', 'kolkata', 'delhi', 'chennai', 'hyd', 'bombay', 'malaysia', 'madras']
    str_converted_to_lower = str_to_check.lower()
    for consulate_location in known_consulate_locations:
        if consulate_location in str_converted_to_lower:
            return consulate_location


df_filter['consulate_location'] = df_filter['message'].apply(get_consulate_location)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [32]:
mapping_dict = {'bombay' : "mumbai", 'hyd' : "hyderabad", "madras" : "chennai"}
df_filter['consulate_location'] = df_filter['consulate_location'].apply(lambda x : mapping_dict.get(x) if mapping_dict.get(x) is not None else x )
df_filter['consulate_location'].fillna("NA", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [34]:
print(df_filter.consulate_location.value_counts())
df_filter.to_csv("Test.csv", index=False)

mumbai       912
delhi        534
chennai      340
hyderabad    302
kolkata      158
NA            54
malaysia       1
Name: consulate_location, dtype: int64


### Extracting Status 
<a id='extract_status'></a>


In [35]:
def get_visa_status(message):
    possible_status = ['approved', 'rejected']
    for _status in possible_status:
        if _status in message.lower():
            return _status

df_filter['visa_status'] = df_filter['message'].apply(get_visa_status)
df_filter['visa_status'].fillna("NA", inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [37]:
df_filter['visa_status'].value_counts()

approved    2027
rejected     200
NA            74
Name: visa_status, dtype: int64

### Extracting University Name
<a id='extract_university'></a>


In [69]:
df_unv = pd.read_excel('AccreditationData.xlsx', sheet_name='InstituteCampuses')

def update_parent_data(location_name, parent_name):
    if parent_name == '-':
        return location_name
    else:
        return parent_name

df_unv['UniqueName'] = df_unv.apply(lambda x: update_parent_data(x.LocationName, x.ParentName), axis=1)
unique_university_names = df_unv['UniqueName'].unique()

In [70]:
print(len(unique_university_names))
# There are 10595 unique universities across USA


10595


In [75]:
from fuzzywuzzy import fuzz
import numpy as np
# from tqdm import tqdm
 
# https://www.datacamp.com/community/tutorials/fuzzy-string-python

def get_university_name(message):
    max, max_index = 0, 'na'
    for unv_index, _unv_name in enumerate(unique_university_names):
        str1, str2 = message, _unv_name
        token_set_ratio = fuzz.token_set_ratio(Str1,Str2)
        # token_set_ratio_list.append(token_set_ratio)
        if token_set_ratio > max:
            max = token_set_ratio
            max_index = unv_index
    # index = np.argmax(token_set_ratio_list)
    return unique_university_names[max_index]


df_filter['University Name'] = df_filter['message'].apply(get_university_name)

1<00:09, 913.18it/s][A
 20%|█▉        | 2082/10595 [00:02<00:09, 862.71it/s][A
 20%|██        | 2171/10595 [00:02<00:09, 844.86it/s][A
 21%|██▏       | 2257/10595 [00:02<00:09, 842.24it/s][A
 22%|██▏       | 2343/10595 [00:02<00:10, 819.59it/s][A
 23%|██▎       | 2426/10595 [00:02<00:10, 777.68it/s][A
 24%|██▍       | 2532/10595 [00:02<00:09, 853.80it/s][A
 25%|██▍       | 2630/10595 [00:02<00:08, 887.37it/s][A
 26%|██▌       | 2731/10595 [00:02<00:08, 922.21it/s][A
 27%|██▋       | 2825/10595 [00:02<00:08, 891.64it/s][A
 28%|██▊       | 2937/10595 [00:03<00:08, 953.98it/s][A
 29%|██▊       | 3034/10595 [00:03<00:09, 828.57it/s][A
 29%|██▉       | 3121/10595 [00:03<00:09, 827.78it/s][A
 31%|███       | 3245/10595 [00:03<00:07, 937.84it/s][A
 32%|███▏      | 3370/10595 [00:03<00:07, 1021.63it/s][A
 33%|███▎      | 3475/10595 [00:03<00:08, 866.28it/s] [A
 34%|███▎      | 3568/10595 [00:03<00:08, 843.97it/s][A
 35%|███▍      | 3683/10595 [00:03<00:07, 920.29it/s][A
 36%|

KeyboardInterrupt: 