In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('Messages_text.csv')
df.head()

Unnamed: 0,sender_id,message
0,-1001285729190,My Visa experience - 07/08/2021\nApproved✅\nDe...
1,-1001285729190,9th July 2021\nLocation: Hyderabad\nAppointmen...
2,-1001285729190,July 9th\nHyderabad Consulate\nIn time 10:25\n...
3,-1001285729190,9th July 2021\nChennai VAC(July 4) and VI\nApp...
4,-1001285729190,Visa experience - 8/7/21 \nApproved✅\nMumbai c...


## Exploring messages

* Its possible that we have empty cells and some jargon messages where the len of the message is < 145 characters.
* Note: 145 is not a magic number. After observing messages where the character size < 145, i concluded that the information available was irrelavent. Hence the messages with < 145 characters are ignored. 

In [3]:
# Check for Nan values in messages column.
df['message'].isnull().sum()
print(df.shape) # (2397, 2)
# As there are 40 null values, we can drop the rows as they are of no use

df.dropna(inplace=True)
print("Shape of dataframe after dropping nan rows")
print(df.shape)

(2397, 2)
Shape of dataframe after dropping nan rows
(2357, 2)


In [4]:

MESSAGES_LEN_TO_IGNORE = 145

df['length_of_message'] = df['message'].apply(lambda x : len(str(x)))
# Filter out of the rows with message length < 145
df_filter = df[df['length_of_message'] > MESSAGES_LEN_TO_IGNORE]


In [5]:
# The final dataframe after filtering out un-necessary messages 
print(df_filter.shape)

(2301, 3)


## Below attributes will be extracted from messages
* [Extracting Status](#extract_status)
* [Extracting Visa Interview Date](#extract_interview_date)
* [Extracting location](#extract_location)
* [Extracting Questions asked in VI](#extract_questions)
* [Extracting University Name](#extract_university)
* ~~Duration~~


### Extracting location
<a id='extract_location'></a>


In [6]:
def get_consulate_location(str_to_check):
    known_consulate_locations = ['hyderabad', 'mumbai', 'kolkata', 'delhi', 'chennai', 'hyd', 'bombay', 'malaysia', 'madras']
    str_converted_to_lower = str_to_check.lower()
    for consulate_location in known_consulate_locations:
        if consulate_location in str_converted_to_lower:
            return consulate_location


df_filter['consulate_location'] = df_filter['message'].apply(get_consulate_location)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [7]:
mapping_dict = {'bombay' : "mumbai", 'hyd' : "hyderabad", "madras" : "chennai"}
df_filter['consulate_location'] = df_filter['consulate_location'].apply(lambda x : mapping_dict.get(x) if mapping_dict.get(x) is not None else x )
df_filter['consulate_location'].fillna("NA", inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [8]:
print(df_filter.consulate_location.value_counts())
df_filter.to_csv("Test.csv", index=False)

mumbai       912
delhi        534
chennai      340
hyderabad    302
kolkata      158
NA            54
malaysia       1
Name: consulate_location, dtype: int64


### Extracting Status 
<a id='extract_status'></a>


In [9]:
def get_visa_status(message):
    possible_status = ['approved', 'rejected']
    for _status in possible_status:
        if _status in message.lower():
            return _status

df_filter['visa_status'] = df_filter['message'].apply(get_visa_status)
df_filter['visa_status'].fillna("NA", inplace=True)


In [10]:
df_filter['visa_status'].value_counts()

approved    2027
rejected     200
NA            74
Name: visa_status, dtype: int64

### Extracting Questions
<a id='extract_questions'></a>


In [22]:
questions_start_with = ['what', 'what\'s', 'which', 'who', 'where', 'why', 'when', 'how', 'whose', 'do', 'are', 'will', 'did ']

import re
import string 


def extract_questions(message):
    questions = []
    regex_pattern = " |".join(questions_start_with)
    for _string in message.lower().split("\n"):
        if _string.endswith("?"):
            questions.append(_string)
        else:
            matches = re.findall(regex_pattern, _string.strip())
            if len(matches) > 0:
                split_str = _string.split()
                if ("vi" in split_str[0] or "vo" in split_str[0]):
                    first_word = split_str[1].strip()
                    if first_word in string.punctuation:
                        for i in range(2, len(split_str)):
                            if split_str[i] not in string.punctuation and split_str[i] not in ['vo', 'vi']:
                                first_word = split_str[i]
                                break


                else:
                     first_word = split_str[0]
                if first_word in questions_start_with:
                    questions.append(_string)
    return questions

df_filter['Questions'] = df_filter['message'].apply(extract_questions)
df_filter['Questions'].fillna("NA", inplace=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return self._update_inplace(result)


In [23]:
df_filter.to_csv("Questions_extracted.csv", index=False)

### Extracting University Name
<a id='extract_university'></a>


In [29]:
from multiprocessing import  Pool
from functools import partial
import numpy as np
# Taken from here : https://stackoverflow.com/questions/26784164/pandas-multiprocessing-apply#:~:text=from%20multiprocessing%20import,run_on_subset%2C%20func)%2C%20num_of_processes)

def parallelize(data, func, num_of_processes=4):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data


In [23]:
df_unv = pd.read_excel('AccreditationData.xlsx', sheet_name='InstituteCampuses')

def update_parent_data(location_name, parent_name):
    if parent_name == '-':
        return location_name
    else:
        return parent_name

df_unv['UniqueName'] = df_unv.apply(lambda x: update_parent_data(x.LocationName, x.ParentName), axis=1)
unique_university_names = df_unv['UniqueName'].unique()

In [24]:
print(len(unique_university_names))
# There are 10595 unique universities across USA


10595


In [25]:
# matchlist = ['Hospital','University','Institute','School','Academy', 'Unv']
# matchlist_lower_case = [pattern.lower() for pattern in matchlist]
# final_patterns_to_check = matchlist_lower_case + matchlist

# regex_str = """[A-Z][^\\.;\\?\\!]*(\\b{regex_str}\\b)[^\\.;\\?\\!]*""".format(regex_str = "\\b|\\b".join(final_patterns_to_check))
# print(regex_str)

In [30]:


from fuzzywuzzy import fuzz
import time
# https://www.datacamp.com/community/tutorials/fuzzy-string-python
# https://www.geeksforgeeks.org/how-to-do-fuzzy-matching-on-pandas-dataframe-column-using-python/ - Explore more

master_data = []

def get_university_name(df):
    
    for index, row_data in df.iterrows():
        print("Processing {index} row".format(index=index))
        start = time.time()
        row = row_data.to_dict()
        max, max_index = 0, 'na'
        for unv_index, _unv_name in enumerate(unique_university_names):
            str1, str2 = row.get('message'), _unv_name
            token_set_ratio = fuzz.token_set_ratio(str1, str2)
            # token_set_ratio_list.append(token_set_ratio)
            if token_set_ratio > max:
                max = token_set_ratio
                max_index = unv_index
        # index = np.argmax(token_set_ratio_list)
        row['university_name'] = unique_university_names[max_index]
        end = time.time()
        print("Total time taken: {sec} sec".format(sec=str(end-start)))
        master_data.append(row)
df_merge = parallelize(df_filter.head(100), get_university_name)

# df_filter['University Name'] = df_filter['message'].apply(get_university_name)

Processing 25 rowProcessing 50 rowProcessing 0 rowProcessing 75 row



Total time taken: 15.823212385177612 sec
Processing 51 row
Total time taken: 23.171153783798218 sec
Processing 76 row
Total time taken: 27.08977723121643 sec
Processing 26 row
Total time taken: 14.865477085113525 sec
Processing 77 row
Total time taken: 39.20491814613342 sec
Processing 1 row
Total time taken: 21.461410760879517 sec
Processing 27 row
Total time taken: 17.971954345703125 sec
Processing 2 row
Total time taken: 21.624213457107544 sec
Processing 78 row
Total time taken: 46.52960181236267 sec
Processing 52 row
Total time taken: 20.068583965301514 sec
Processing 28 row
Total time taken: 29.28138780593872 sec
Processing 3 row
Total time taken: 35.054041624069214 sec
Processing 79 row
Total time taken: 29.050139904022217 sec
Processing 29 row
Total time taken: 42.27673888206482 sec
Processing 53 row
Total time taken: 14.049971580505371 sec
Processing 80 row
Total time taken: 31.607022523880005 sec
Processing 

KeyboardInterrupt: 

### Extracting Interview Date
<a id='extract_interview_date'></a>


In [52]:
%%time
import datefinder

def extract_date_from_message(message):
    try:
        matches = list(datefinder.find_dates(message))
        return matches[0]
    except Exception as e:
        return 'NA'


df_filter['Visa Interview Date'] = df_filter['message'].apply(extract_date_from_message)
df_filter.to_csv("Dates.csv", index=False)

CPU times: user 17 s, sys: 6.81 ms, total: 17.1 s
Wall time: 17 s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [53]:
%%time
# from datetime import datetime

# greater_than_date = datetime.strptime('2021-07-12', '%Y-%m-%d')
# less_than_date = datetime.strptime('2020-01-01', '%Y-%m-%d')

# def replace_value(visa_interview_date):
#     try:
#         final_vi_date = datetime.strptime(visa_interview_date.split(" ")[0], '%Y-%m-%d')

#         if (final_vi_date > greater_than_date) or (final_vi_date < less_than_date):
#             return "NA"
#         else:
#             return visa_interview_date
#     except Exception as e:
#         return "NA"


# df_filter['Visa Interview Date'] = df_filter['Visa Interview Date'].apply(replace_value)

# df_filter['Visa Interview Date'] = df_filter['Visa Interview Date'].replace(pd.NaT, "NA")
# df_filter['Visa Interview Date'] = pd.to_datetime(df_filter['Visa Interview Date'])

# df_filter.loc[df_filter['Visa Interview Date'] > greater_than_date, "Visa Interview Date"] = "NA"
# df_filter.loc[df_filter['Visa Interview Date'] < less_than_date, "Visa Interview Date"] = "NA"


# def extract_dates_for_failed_messages(message, extracted_date):
#     try:
#         return dateparser.parse(str(extracted_date))
#     except Exception as e:
#         matches = search_dates(message)
#         for match in matches:
#             if today.month and today.year and today.day:
#                 return match

# df_filter['Visa Interview Date'] = df_filter.apply(lambda x : extract_dates_for_failed_messages(x['message'], x['Visa Interview Date']), axis=1)
df_filter.to_csv("Final_Dates.csv", index=False)           


CPU times: user 102 ms, sys: 20.2 ms, total: 122 ms
Wall time: 119 ms


In [13]:
txt = """ "VISA : REJECTED âŒ
Location: New Delhi
Slot time: 10:30
In-time: 9:30AM
Out-time: 10:05 AM
Counter : 21
http://t.me/f1interviewreviews

Troy University

Only four to five counters were open.

Duration: around 2 minutes

VO is Asian American  guy in his 30â€™s

good morning pass me your passport, i20 and sevis fee receipt
good morning sir, how are you  (passed)
i am doing good, Thank you.
place your right hand four finger on scanner.
did 

why usa?
Sir i want to pursue my masterâ€™s in computer science with specialisation in software development
Typing for 30-40 seconds and look towards i-20

what did you do in undergraduate?
i have completed my bachelors in computer science and engineering from charusat university, recently in 2021


how you going to fund?
My parents are going to sponsor me.My father has a savings of 35 lakhs rupees in his savings account which equivalent to 47 thousand USD and apart from that we have a immovable assets worth 1.27 crores which is equivalent to 1.7 lakh USD 

Typing for 20-25 seconds 


Vo: Unfortunately i am not approving your visa.


@f1interviewreviews"

"""
import re
questions = []

regex_pattern = " |".join(questions_start_with)
for _string in txt.lower().split("\n"):
    if _string.endswith("?"):
        questions.append(_string)
    else:
        matches = re.findall(regex_pattern, _string.strip())
        if len(matches) > 0:
            split_str = _string.split()
            first_word = split_str[1] if "vi" in split_str[0] else split_str[0]
            if first_word in questions_start_with:
                questions.append(_string)





why usa?
what did you do in undergraduate?
how you going to fund?


