In [1]:
import pandas as pd
import re
import matplotlib.pyplot as pt
import seaborn as sns

In [7]:
# Load the raw profile dataset and report how many rows and columns it has
df = pd.read_csv("../../data/raw/ProfileData/vivah_profiles_clean.csv")
n_profiles, n_features = df.shape
print("No of Profiles", n_profiles, " & no of features ", n_features)
df.head()

No of Profiles 300  & no of features  8


Unnamed: 0,Profile ID,Religion,Caste,Mother Tongue,"Age, Height",Profession,Education,Location
0,VVZ5841,Hindu,Maratha,English,"26 Yrs, 5ft 4in - 162cm",Scientist,Masters in Medicine,India - Maharashtra - Navi Mumbai
1,VVF8907,Muslim,Hanafi,Urdu,"21 Yrs, 5ft 1in - 154cm",Not Working,Bachelors in Nursing/ Health Sciences,India - Telangana - Hyderabad
2,VVH2947,Hindu,Brahmin Himachali,Hindi,"35 Yrs, 5ft 4in - 162cm",Not Working,Bachelors in Commerce,India - Himachal Pradesh - Shimla
3,VVF8880,Hindu,Koli,Kannada,"30 Yrs, 5ft 4in - 162cm",Doctor,Masters in Medicine,India - Karnataka - Bidar
4,VHQ0512,Hindu,Kuruba,Kannada,"30 Yrs, 5ft 4in - 162cm",Admin/ Front Office,Bachelors in Commerce,India - Karnataka - Bangalore


In [22]:
# The raw dataset stores age and height together in a single "Age, Height" column.
# The helpers below are used to split it into separate Age and Height columns.
def normalize_age_height_column(val):
    """Extract age (years) and height (cm) from a combined "Age, Height" string.

    Parameters
    ----------
    val : str or missing
        Text such as "26 Yrs, 5ft 4in - 162cm". Anything that is not a string
        (for example None or a float NaN) is treated as missing.

    Returns
    -------
    tuple
        (age, height). Each element is an int, or None when the corresponding
        "<digits> Yrs" / "<digits> cm" pattern is not found.
    """
    # re.search raises TypeError on non-string input (e.g. the float NaN used
    # for empty cells), so report "nothing found" instead of crashing.
    if not isinstance(val, str):
        return None, None
    age_val = re.search(r'(\d+)\s*Yrs', val, flags=re.IGNORECASE)
    height_val = re.search(r'(\d+)\s*cm', val, flags=re.IGNORECASE)
    age = int(age_val.group(1)) if age_val else None
    height = int(height_val.group(1)) if height_val else None
    return age, height  # tuple: (age, height)
def add_FeetInchColumnfromcm(cm):
    """Render a height given in centimetres as a "<feet>ft <inch>in" label.

    Input that pd.isna reports as missing yields None. Otherwise the value is
    converted to inches (1 in = 2.54 cm), rounded to a whole inch, and split
    into feet plus the remaining inches.
    """
    if not pd.isna(cm):
        whole_inches = int(round(cm / 2.54))
        ft_part, in_part = divmod(whole_inches, 12)
        return f"{ft_part}ft {in_part}in"
    return None

# Parse every "Age, Height" entry into an (age, height) pair, one column each
age_height_pairs = df['Age, Height'].apply(
    lambda raw_text: pd.Series(normalize_age_height_column(raw_text))
)
df[['Age', 'Height']] = age_height_pairs

# Human-readable feet/inch label derived from the centimetre height
df["Height_ft_in"] = df['Height'].apply(add_FeetInchColumnfromcm)

# The combined source column is no longer needed once it has been split
df = df.drop(columns=['Age, Height'])

df.head()


Unnamed: 0,Profile ID,Religion,Caste,Mother Tongue,Profession,Education,Location,Age,Height,Height_ft_in
0,VVZ5841,Hindu,Maratha,English,Scientist,Masters in Medicine,India - Maharashtra - Navi Mumbai,26,162,5ft 4in
1,VVF8907,Muslim,Hanafi,Urdu,Not Working,Bachelors in Nursing/ Health Sciences,India - Telangana - Hyderabad,21,154,5ft 1in
2,VVH2947,Hindu,Brahmin Himachali,Hindi,Not Working,Bachelors in Commerce,India - Himachal Pradesh - Shimla,35,162,5ft 4in
3,VVF8880,Hindu,Koli,Kannada,Doctor,Masters in Medicine,India - Karnataka - Bidar,30,162,5ft 4in
4,VHQ0512,Hindu,Kuruba,Kannada,Admin/ Front Office,Bachelors in Commerce,India - Karnataka - Bangalore,30,162,5ft 4in


In [25]:
# Make the unit explicit in the height column's name
df = df.rename(columns={"Height": "Height in cm"})
df.head()

Unnamed: 0,Profile ID,Religion,Caste,Mother Tongue,Profession,Education,Location,Age,Height in cm,Height_ft_in
0,VVZ5841,Hindu,Maratha,English,Scientist,Masters in Medicine,India - Maharashtra - Navi Mumbai,26,162,5ft 4in
1,VVF8907,Muslim,Hanafi,Urdu,Not Working,Bachelors in Nursing/ Health Sciences,India - Telangana - Hyderabad,21,154,5ft 1in
2,VVH2947,Hindu,Brahmin Himachali,Hindi,Not Working,Bachelors in Commerce,India - Himachal Pradesh - Shimla,35,162,5ft 4in
3,VVF8880,Hindu,Koli,Kannada,Doctor,Masters in Medicine,India - Karnataka - Bidar,30,162,5ft 4in
4,VHQ0512,Hindu,Kuruba,Kannada,Admin/ Front Office,Bachelors in Commerce,India - Karnataka - Bangalore,30,162,5ft 4in


In [28]:
# Split the composite Location column ("Country - State - City") into
# separate Country, State and City columns (at most two splits per value)
location_parts = df['Location'].str.split(" - ", n=2, expand=True)
df[['Country', 'State', 'City']] = location_parts
df = df.drop(columns=["Location"])
df.head()

Unnamed: 0,Profile ID,Religion,Caste,Mother Tongue,Profession,Education,Age,Height in cm,Height_ft_in,Country,State,City
0,VVZ5841,Hindu,Maratha,English,Scientist,Masters in Medicine,26,162,5ft 4in,India,Maharashtra,Navi Mumbai
1,VVF8907,Muslim,Hanafi,Urdu,Not Working,Bachelors in Nursing/ Health Sciences,21,154,5ft 1in,India,Telangana,Hyderabad
2,VVH2947,Hindu,Brahmin Himachali,Hindi,Not Working,Bachelors in Commerce,35,162,5ft 4in,India,Himachal Pradesh,Shimla
3,VVF8880,Hindu,Koli,Kannada,Doctor,Masters in Medicine,30,162,5ft 4in,India,Karnataka,Bidar
4,VHQ0512,Hindu,Kuruba,Kannada,Admin/ Front Office,Bachelors in Commerce,30,162,5ft 4in,India,Karnataka,Bangalore


In [29]:
# Normalise the string / categorical columns: lowercase every value and
# trim leading and trailing whitespace
text_Categorical_cols = [
    'Religion', 'Caste', 'Mother Tongue', 'Profession',
    'Education', 'Country', 'State', 'City',
]
for col_name in text_Categorical_cols:
    lowered = df[col_name].str.lower()
    df[col_name] = lowered.str.strip()

In [32]:
# Handling missing data for both numerical and categorical values in the dataset:
# text columns get the placeholder "unknown", numeric columns get their median.
df[text_Categorical_cols] = df[text_Categorical_cols].fillna("unknown")
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Height in cm"] = df["Height in cm"].fillna(df["Height in cm"].median())
# Rebuild the feet/inch label from the imputed heights so both height columns
# agree; otherwise rows whose height was missing would keep an empty label
# next to a median-filled centimetre value.
df["Height_ft_in"] = df["Height in cm"].apply(add_FeetInchColumnfromcm)


In [33]:
# Saving the processed data.
# The file is written to the processed-data directory because that is the
# location the analysis cells read it back from; writing it to the working
# directory left the later read pointing at a different (possibly stale) file.
# NOTE(review): assumes ../../data/processed/ProfileData/ already exists.
df.to_csv("../../data/processed/ProfileData/processed_vivah_profiles.csv", index=False)

In [2]:
# Reload the processed dataset and list the columns available for analysis
processed_csv = "../../data/processed/ProfileData/processed_vivah_profiles.csv"
df = pd.read_csv(processed_csv)
print(df.columns)

Index(['Profile ID', 'Religion', 'Caste', 'Mother Tongue', 'Profession',
       'Education', 'Age', 'Height in cm', 'Height_ft_in', 'Country', 'State',
       'City'],
      dtype='object')


In [11]:
# Number of distinct religions and how many profiles fall under each
religion = df["Religion"]
print("No of religions considered ", religion.nunique())

print("These Religions are", religion.value_counts())


No of religions considered  3
These Religions are Religion
hindu       240
muslim       30
buddhist     30
Name: count, dtype: int64


In [13]:
# Number of distinct castes and how many profiles fall under each
caste = df["Caste"]
print("No of Casts considered ", caste.nunique())

print("These Castes are", caste.value_counts())

No of Casts considered  10
These Castes are Caste
maratha              30
hanafi               30
brahmin himachali    30
koli                 30
kuruba               30
brahmin              30
buddhist             30
rajput               30
vishwakarma          30
brahmin gour         30
Name: count, dtype: int64


In [12]:
# Number of distinct education qualifications and the profile count for each
education = df["Education"]
print("Type of Educational Qualifications  considered ", education.nunique())

print("These Qualifications are", education.value_counts())

Type of Educational Qualifications  considered  7
These Qualifications are Education
bachelors in commerce                    90
masters in medicine                      60
bachelors in nursing/ health sciences    30
high school in arts                      30
other in fashion                         30
masters in finance                       30
bachelors in arts                        30
Name: count, dtype: int64


In [3]:
# For each column: number of distinct values, then the ten most frequent ones
for heading, col_name in (("Unique States:", "State"), ("Unique Professions:", "Profession")):
    print(heading, df[col_name].nunique())
    print(df[col_name].value_counts().head(10))

Unique States: 8
State
maharashtra         60
karnataka           60
telangana           30
himachal pradesh    30
delhi               30
uttar pradesh       30
jharkhand           30
odisha              30
Name: count, dtype: int64
Unique Professions: 7
Profession
not working             120
scientist                30
doctor                   30
admin/ front office      30
other                    30
consultant               30
chartered accountant     30
Name: count, dtype: int64
