### Loading the Datasets & Libraries

In [51]:
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import seaborn as sns
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

import re
import warnings
warnings.filterwarnings('ignore')


In [52]:
df = pd.read_csv("../../data/raw/Roommate_Compatibility_Survey.csv")

### Initial Observations or Trends :

In [53]:
df.head()

Unnamed: 0,Timestamp,Name (optional),Gender,Age,Faculty (الكلية),Religion (الديانة),Are your parents Abroad? (هل والديكم في الخارج؟),Do you live with your parents? (هل تعيش مع والديك؟),Do you smoke? (هل تدخن؟),Do you mind if your roommate smokes? (هل تمانع إذا كان زميلك في السكن يدخن؟),...,I go to bed: (أنام:),I get up: (أستيقظ:),I expect to study: \r\n(أتوقع أن أذاكر:),I plan to study:\r\n(أخطط للدراسة:),When I study: \r\n(عندما أدرس:),I require: \r\n(أحتاج:),What concerns do you have about your future roommate? \r\n(ما هي المخاوف التي لديك حول زميل السكن المستقبلي؟),How often do you prefer your roommate to have guests over? \r\n(كم مرة تفضل أن يستضيف زميل السكن الضيوف؟),"When dealing with conflicts, how do you usually handle them? \r\n(عند التعامل مع الصراعات، كيف عادة ما تتعامل معها؟)",How do you prefer to communicate with your roommate about important matters? (كيف تفضل التواصل مع زميلك في السكن بشأن المسائل الهامة؟)
0,2024/04/25 11:59:16 pm GMT+3,Youssef hatem abd elmasen,Male,21,Computer Science & Engineering (علوم الحاسوب و...,Islam (الإسلام),No (لا),Yes (نعم),No (لا),Yes (نعم),...,Around Midnight (حوالي منتصف الليل),As late as possible (في أقرب وقت ممكن),In my room (في غرفتي),In the evening (في المساء),I don’t have a preference (ليس لدي تفضيل),Some private time (بعض الوقت الخاص),Messy (فوضوي),Rarely (نادرًا),I hint at what bothers me in a joking manner. ...,Face-to-face (وجهًا لوجه)
1,2024/04/25 11:59:28 pm GMT+3,Basmlaa,Female,19,Computer Science & Engineering (علوم الحاسوب و...,Islam (الإسلام),Yes (نعم),No (لا),No (لا),Yes (نعم),...,After Midnight (بعد منتصف الليل),Somewhat Early (باكرًا إلى حد ما),In my room (في غرفتي),In the afternoon (بعد الظهر),I don’t have a preference (ليس لدي تفضيل),A significant amount of private time (كمية كبي...,Not picking up after themselves (عدم التنظيف ب...,Frequently (بشكل متكرر),I am not comfortable addressing a conflict. (ل...,Face-to-face (وجهًا لوجه)
2,2024/04/25 11:59:49 pm GMT+3,Eman elshahat,Female,21,Computer Science & Engineering (علوم الحاسوب و...,Islam (الإسلام),No (لا),Yes (نعم),No (لا),Maybe (ربما),...,After Midnight (بعد منتصف الليل),Somewhat Early (باكرًا إلى حد ما),In my room (في غرفتي),In the evening (في المساء),I need total quiet (أحتاج لهدوء تام),A significant amount of private time (كمية كبي...,Obnoxious behavior (سلوك مزعج);Loud music/TV (...,Often (غالبًا),I am not comfortable addressing a conflict. (ل...,Face-to-face (وجهًا لوجه)
3,2024/04/26 12:00:01 am GMT+3,روان اسامه طبنجات,Female,20,Computer Science & Engineering (علوم الحاسوب و...,Islam (الإسلام),No (لا),Yes (نعم),No (لا),Yes (نعم),...,After Midnight (بعد منتصف الليل),Very Early (باكرًا جدًا),In my room (في غرفتي),Late at night (في وقت متأخر من الليل),I like to have music or other background noise...,Very little private time (وقتًا خاصًا قليلًا ج...,Obnoxious behavior (سلوك مزعج);Not picking up ...,Sometimes (أحيانًا),I am not comfortable addressing a conflict. (ل...,Face-to-face (وجهًا لوجه)
4,2024/04/26 12:00:01 am GMT+3,Mai,Female,19,Computer Science & Engineering (علوم الحاسوب و...,Islam (الإسلام),No (لا),Yes (نعم),No (لا),No (لا),...,After Midnight (بعد منتصف الليل),Late (متأخرًا),Both inside and outside my room (كلاهما داخل و...,Late at night (في وقت متأخر من الليل),I don’t have a preference (ليس لدي تفضيل),Some private time (بعض الوقت الخاص),Obnoxious behavior (سلوك مزعج);Not picking up ...,Rarely (نادرًا),I hint at what bothers me in a joking manner. ...,Face-to-face (وجهًا لوجه)


In [54]:
df.drop(columns=['Timestamp','Name (optional) '],inplace=True)

In [55]:
# Function to remove Arabic text using regex
def remove_arabic(text):
    return re.sub(r'\([^)]*\)', '', text).strip()

# Function to remove Arabic text from both cells and column headers in a DataFrame
def remove_arabic_dataframe(df):
    # Clean column headers
    cleaned_columns = [remove_arabic(column) for column in df.columns]
    
    # Clean data in each cell
    cleaned_data = [[remove_arabic(cell) for cell in row] for row in df.values]
    
    # Create a new DataFrame with cleaned data and headers
    cleaned_df = pd.DataFrame(cleaned_data, columns=cleaned_columns)
    
    return cleaned_df

In [56]:
df = remove_arabic_dataframe(df)

In [57]:
df.head()

Unnamed: 0,Gender,Age,Faculty,Religion,Are your parents Abroad?,Do you live with your parents?,Do you smoke?,Do you mind if your roommate smokes?,What kind of relationship are you looking for in a roommate?,What accurately describes your ideal study environment?,...,I go to bed:,I get up:,I expect to study:,I plan to study:,When I study:,I require:,What concerns do you have about your future roommate?,How often do you prefer your roommate to have guests over?,"When dealing with conflicts, how do you usually handle them?",How do you prefer to communicate with your roommate about important matters?
0,Male,21,Computer Science & Engineering,Islam,No,Yes,No,Yes,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,Around Midnight,As late as possible,In my room,In the evening,I don’t have a preference,Some private time,Messy,Rarely,I hint at what bothers me in a joking manner.,Face-to-face
1,Female,19,Computer Science & Engineering,Islam,Yes,No,No,Yes,I am looking for a roommate to do everything with,I need to study in a very quiet environment.,...,After Midnight,Somewhat Early,In my room,In the afternoon,I don’t have a preference,A significant amount of private time,Not picking up after themselves ;Lack of respe...,Frequently,I am not comfortable addressing a conflict.,Face-to-face
2,Female,21,Computer Science & Engineering,Islam,No,Yes,No,Maybe,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,After Midnight,Somewhat Early,In my room,In the evening,I need total quiet,A significant amount of private time,Obnoxious behavior ;Loud music/TV ;Not picking...,Often,I am not comfortable addressing a conflict.,Face-to-face
3,Female,20,Computer Science & Engineering,Islam,No,Yes,No,Yes,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,After Midnight,Very Early,In my room,Late at night,I like to have music or other background noise,Very little private time,Obnoxious behavior ;Not picking up after thems...,Sometimes,I am not comfortable addressing a conflict.,Face-to-face
4,Female,19,Computer Science & Engineering,Islam,No,Yes,No,No,I am looking for a roommate to do everything with,I need to study in a very quiet environment.,...,After Midnight,Late,Both inside and outside my room,Late at night,I don’t have a preference,Some private time,Obnoxious behavior ;Not picking up after thems...,Rarely,I hint at what bothers me in a joking manner.,Face-to-face


In [58]:
# Checking the number of rows and columns

num_rows, num_columns = df.shape

print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}\n")

Number of Rows: 256
Number of Columns: 23



In [59]:
# Creating a table for missing values, unique values and data types of the features

missing_values = pd.DataFrame({'Feature': df.columns,
                              'No. of Missing Values': df.isnull().sum().values,
                              '% of Missing Values': ((df.isnull().sum().values)/len(df)*100)})

unique_values = pd.DataFrame({'Feature': df.columns,
                              'No. of Unique Values': df.nunique().values})

feature_types = pd.DataFrame({'Feature': df.columns,
                              'DataType': df.dtypes})

merged_df = pd.merge(missing_values, unique_values, on='Feature', how='left')
merged_df = pd.merge(merged_df, feature_types, on='Feature', how='left')

merged_df

Unnamed: 0,Feature,No. of Missing Values,% of Missing Values,No. of Unique Values,DataType
0,Gender,0,0.0,2,object
1,Age,0,0.0,8,object
2,Faculty,0,0.0,10,object
3,Religion,0,0.0,2,object
4,Are your parents Abroad?,0,0.0,2,object
5,Do you live with your parents?,0,0.0,2,object
6,Do you smoke?,0,0.0,3,object
7,Do you mind if your roommate smokes?,0,0.0,3,object
8,What kind of relationship are you looking for ...,0,0.0,2,object
9,What accurately describes your ideal study env...,0,0.0,3,object


In [60]:
df.describe().T

Unnamed: 0,count,unique,top,freq
Gender,256,2,Female,141
Age,256,8,19,85
Faculty,256,10,Computer Science & Engineering,92
Religion,256,2,Islam,251
Are your parents Abroad?,256,2,No,157
Do you live with your parents?,256,2,Yes,182
Do you smoke?,256,3,No,244
Do you mind if your roommate smokes?,256,3,Yes,169
What kind of relationship are you looking for in a roommate?,256,2,I am looking for a roommate who I can peaceful...,183
What accurately describes your ideal study environment?,256,3,I need to study in a very quiet environment.,194


In [61]:
# Count duplicate rows in train_data
duplicates = df.duplicated().sum()

# Print the results
print(f"Number of duplicate rows : {duplicates}")

Number of duplicate rows : 0


In [62]:
df.rename(columns={
    'Gender': 'Gender',
    'Age': 'Age',
    'Faculty': 'Faculty',
    'Religion': 'Religion',
    'Are your parents Abroad?': 'Parents_Abroad',
    'Do you live with your parents?': 'Living_with_Parents',
    'Do you smoke?': 'Do_you_smoke',
    'Do you mind if your roommate smokes?': 'Attitude_towards_Roommate_Smoking',
    'What kind of relationship are you looking for in a roommate?': 'Preferred_Roommate_Relationship_Type',
    'What accurately describes your ideal study environment?': 'Ideal_Study_Environment_Description',
    'How do you feel about borrowing/sharing items?': 'Attitude_towards_Borrowing_Sharing',
    'I would describe my room at home as:': 'Description_of_Personal_Room_At_Home',
    'I want my room to be:': 'Desired_Room_Attributes',
    'I go to bed:': 'Bedtime_Preference',
    'I get up:': 'Wake_Up_Time_Preference',
    'I expect to study:': 'Expected_Study_Time_Preference',
    'I plan to study:': 'Planned_Study_Time_Preference',
    'When I study:': 'Study_Time_Preference',
    'I require:': 'Private_Time_Requirements',
    'What concerns do you have about your future roommate?': 'Concerns_about_Future_Roommate',
    'How often do you prefer your roommate to have guests over?': 'Guest_Frequency_Preference',
    'When dealing with conflicts, how do you usually handle them?': 'Conflict_Handling_Method',
    'How do you prefer to communicate with your roommate about important matters?': 'Communication_Preference_with_Roommate'

}, inplace=True)

In [63]:
df.head()

Unnamed: 0,Gender,Age,Faculty,Religion,Parents_Abroad,Living_with_Parents,Do_you_smoke,Attitude_towards_Roommate_Smoking,Preferred_Roommate_Relationship_Type,Ideal_Study_Environment_Description,...,Bedtime_Preference,Wake_Up_Time_Preference,Expected_Study_Time_Preference,Planned_Study_Time_Preference,Study_Time_Preference,Private_Time_Requirements,Concerns_about_Future_Roommate,Guest_Frequency_Preference,Conflict_Handling_Method,Communication_Preference_with_Roommate
0,Male,21,Computer Science & Engineering,Islam,No,Yes,No,Yes,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,Around Midnight,As late as possible,In my room,In the evening,I don’t have a preference,Some private time,Messy,Rarely,I hint at what bothers me in a joking manner.,Face-to-face
1,Female,19,Computer Science & Engineering,Islam,Yes,No,No,Yes,I am looking for a roommate to do everything with,I need to study in a very quiet environment.,...,After Midnight,Somewhat Early,In my room,In the afternoon,I don’t have a preference,A significant amount of private time,Not picking up after themselves ;Lack of respe...,Frequently,I am not comfortable addressing a conflict.,Face-to-face
2,Female,21,Computer Science & Engineering,Islam,No,Yes,No,Maybe,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,After Midnight,Somewhat Early,In my room,In the evening,I need total quiet,A significant amount of private time,Obnoxious behavior ;Loud music/TV ;Not picking...,Often,I am not comfortable addressing a conflict.,Face-to-face
3,Female,20,Computer Science & Engineering,Islam,No,Yes,No,Yes,I am looking for a roommate who I can peaceful...,I need to study in a very quiet environment.,...,After Midnight,Very Early,In my room,Late at night,I like to have music or other background noise,Very little private time,Obnoxious behavior ;Not picking up after thems...,Sometimes,I am not comfortable addressing a conflict.,Face-to-face
4,Female,19,Computer Science & Engineering,Islam,No,Yes,No,No,I am looking for a roommate to do everything with,I need to study in a very quiet environment.,...,After Midnight,Late,Both inside and outside my room,Late at night,I don’t have a preference,Some private time,Obnoxious behavior ;Not picking up after thems...,Rarely,I hint at what bothers me in a joking manner.,Face-to-face


In [64]:
# Checking the number of rows and columns

num_rows, num_columns = df.shape

print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}\n")

Number of Rows: 256
Number of Columns: 23



In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 256 entries, 0 to 255
Data columns (total 23 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Gender                                  256 non-null    object
 1   Age                                     256 non-null    object
 2   Faculty                                 256 non-null    object
 3   Religion                                256 non-null    object
 4   Parents_Abroad                          256 non-null    object
 5   Living_with_Parents                     256 non-null    object
 6   Do_you_smoke                            256 non-null    object
 7   Attitude_towards_Roommate_Smoking       256 non-null    object
 8   Preferred_Roommate_Relationship_Type    256 non-null    object
 9   Ideal_Study_Environment_Description     256 non-null    object
 10  Attitude_towards_Borrowing_Sharing      256 non-null    object
 11  Descri

In [66]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Column: {column}")
    print(unique_values)
    print()

Column: Gender
['Male' 'Female']

Column: Age
['21' '19' '20' '22' '18' '23' '17' '> 23']

Column: Faculty
['Computer Science & Engineering' 'Medicine' 'Textile Science Engineering'
 'Business' 'Science' 'Engineering' 'Dentistry' 'Pharmacy'
 'Nursing Sciences' 'Applied Health Sciences Technology']

Column: Religion
['Islam' 'Christianity']

Column: Parents_Abroad
['No' 'Yes']

Column: Living_with_Parents
['Yes' 'No']

Column: Do_you_smoke
['No' 'Yes' 'Sometimes']

Column: Attitude_towards_Roommate_Smoking
['Yes' 'Maybe' 'No']

Column: Preferred_Roommate_Relationship_Type
['I am looking for a roommate who I can peacefully coexist with.'
 'I am looking for a roommate to do everything with']

Column: Ideal_Study_Environment_Description
['I need to study in a very quiet environment.'
 'My ideal study environment has some background noise.'
 'High noise levels do not disturb me while studying.']

Column: Attitude_towards_Borrowing_Sharing
['I am fine with sharing items, but my roommate has 

In [67]:
# Dictionary mapping old values to new values for each column
rename_mapping = {
    'Faculty': {
        'Computer Science & Engineering': 'Computer Science',
        'Textile Science Engineering': 'Textile Engineering',
        'Business': 'Business',
        'Science': 'Science',
        'Engineering': 'Engineering',
        'Dentistry': 'Dentistry',
        'Pharmacy': 'Pharmacy',
        'Nursing Sciences': 'Nursing',
        'Applied Health Sciences Technology': 'Health Sciences'
    },
    'Preferred_Roommate_Relationship_Type': {
        'I am looking for a roommate who I can peacefully coexist with.': 'Peaceful coexistence',
        'I am looking for a roommate to do everything with': 'Close companionship'
    },
    'Ideal_Study_Environment_Description': {
        'I need to study in a very quiet environment.': 'Very quiet',
        'My ideal study environment has some background noise.': 'Background noise',
        'High noise levels do not disturb me while studying.': 'High noise tolerance'
    },
    'Attitude_towards_Borrowing_Sharing': {
        'I am fine with sharing items, but my roommate has to ask before using something that is mine.': 'Ask before sharing',
        'I do not like sharing items and would prefer if my roommate used her/his own items only.': 'Prefer personal items',
        'I am fine with sharing items and my roommate can just use it without asking.': 'Share freely'
    },
    'Description_of_Personal_Room_At_Home': {
        'Always clean and organized': 'Clean and organized',
        'Fairly neat and clean': 'Neat',
        'Disaster Area': 'Disorganized',
        'Cluttered': 'Cluttered'
    },
    'Desired_Room_Attributes': {
        'A combination of social and quiet': 'Social & quiet',
        'Quiet and study oriented': 'Study oriented',
        'A social gathering place for friends to hang out': 'Social gathering'
    },
    'Bedtime_Preference': {
        'Around Midnight': 'Around midnight',
        'After Midnight': 'After midnight',
        'Before midnight': 'Before midnight',
        'Usually right at': 'at any time'
    },
    'Wake_Up_Time_Preference': {
        'As late as possible': 'As late as possible',
        'Somewhat Early': 'Somewhat early',
        'Very Early': 'Very early',
        'Late': 'Late',
        'Usually right at': 'at any time'
    },
    'Expected_Study_Time_Preference': {
        'In my room': 'In room',
        'Both inside and outside my room': 'Both',
        'Outside my room': 'Outside room'
    },
    'Planned_Study_Time_Preference': {
        'In the evening': 'Evening',
        'In the afternoon': 'Afternoon',
        'Late at night': 'Late night',
        'In the morning': 'Morning'
    },
    'Study_Time_Preference': {
        'I don’t have a preference': 'No preference',
        'I need total quiet': 'Total quiet',
        'I like to have music or other background noise': 'Background noise'
    },
    'Private_Time_Requirements': {
        'Some private time': 'Some',
        'A significant amount of private time': 'Significant',
        'Very little private time': 'Very little'
    },
    'Conflict_Handling_Method': {
        'I hint at what bothers me in a joking manner.': 'Hint jokingly',
        'I am not comfortable addressing a conflict.': 'Avoid conflict',
        'I address issues in a blunt manner.': 'Blunt',
        'I am able to express my feelings and concerns in a calm manner.': 'Calm'
    }
}

# Apply renaming using replace method in pandas
df.replace(rename_mapping, inplace=True)

In [68]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f"Column: {column}")
    print(unique_values)
    print()

Column: Gender
['Male' 'Female']

Column: Age
['21' '19' '20' '22' '18' '23' '17' '> 23']

Column: Faculty
['Computer Science' 'Medicine' 'Textile Engineering' 'Business' 'Science'
 'Engineering' 'Dentistry' 'Pharmacy' 'Nursing' 'Health Sciences']

Column: Religion
['Islam' 'Christianity']

Column: Parents_Abroad
['No' 'Yes']

Column: Living_with_Parents
['Yes' 'No']

Column: Do_you_smoke
['No' 'Yes' 'Sometimes']

Column: Attitude_towards_Roommate_Smoking
['Yes' 'Maybe' 'No']

Column: Preferred_Roommate_Relationship_Type
['Peaceful coexistence' 'Close companionship']

Column: Ideal_Study_Environment_Description
['Very quiet' 'Background noise' 'High noise tolerance']

Column: Attitude_towards_Borrowing_Sharing
['Ask before sharing' 'Prefer personal items' 'Share freely']

Column: Description_of_Personal_Room_At_Home
['Clean and organized' 'Neat' 'Disorganized' 'Cluttered']

Column: Desired_Room_Attributes
['Social & quiet' 'Study oriented' 'Social gathering']

Column: Bedtime_Preferenc

##### Age (convert it to categorical) 

In [69]:
age_counts = df['Age'].value_counts()
print("Count of each unique value in the 'Age' column:")
print(age_counts)

Count of each unique value in the 'Age' column:
Age
19      85
20      71
18      43
21      37
> 23    10
22       5
23       3
17       2
Name: count, dtype: int64


In [70]:
df['Age'] = df['Age'].replace(['> 23', '23'], '23')

In [71]:
age_counts = df['Age'].value_counts()
print("Count of each unique value in the 'Age' column:")
print(age_counts)

Count of each unique value in the 'Age' column:
Age
19    85
20    71
18    43
21    37
23    13
22     5
17     2
Name: count, dtype: int64


In [72]:
df.head()

Unnamed: 0,Gender,Age,Faculty,Religion,Parents_Abroad,Living_with_Parents,Do_you_smoke,Attitude_towards_Roommate_Smoking,Preferred_Roommate_Relationship_Type,Ideal_Study_Environment_Description,...,Bedtime_Preference,Wake_Up_Time_Preference,Expected_Study_Time_Preference,Planned_Study_Time_Preference,Study_Time_Preference,Private_Time_Requirements,Concerns_about_Future_Roommate,Guest_Frequency_Preference,Conflict_Handling_Method,Communication_Preference_with_Roommate
0,Male,21,Computer Science,Islam,No,Yes,No,Yes,Peaceful coexistence,Very quiet,...,Around midnight,As late as possible,In room,Evening,No preference,Some,Messy,Rarely,Hint jokingly,Face-to-face
1,Female,19,Computer Science,Islam,Yes,No,No,Yes,Close companionship,Very quiet,...,After midnight,Somewhat early,In room,Afternoon,No preference,Significant,Not picking up after themselves ;Lack of respe...,Frequently,Avoid conflict,Face-to-face
2,Female,21,Computer Science,Islam,No,Yes,No,Maybe,Peaceful coexistence,Very quiet,...,After midnight,Somewhat early,In room,Evening,Total quiet,Significant,Obnoxious behavior ;Loud music/TV ;Not picking...,Often,Avoid conflict,Face-to-face
3,Female,20,Computer Science,Islam,No,Yes,No,Yes,Peaceful coexistence,Very quiet,...,After midnight,Very early,In room,Late night,Background noise,Very little,Obnoxious behavior ;Not picking up after thems...,Sometimes,Avoid conflict,Face-to-face
4,Female,19,Computer Science,Islam,No,Yes,No,No,Close companionship,Very quiet,...,After midnight,Late,Both,Late night,No preference,Some,Obnoxious behavior ;Not picking up after thems...,Rarely,Hint jokingly,Face-to-face


#### feauture with multiple answers (Concerns_about_Future_Roommate) 

In [73]:
concerns_list = [
    'Obnoxious behavior',
    'Loud music/TV',
    'Being loud when I’m studying/sleeping',
    'Eating my food',
    'Not picking up after themselves',
    'Being unfriendly',
    'Lack of respect for me and my privacy',
    'Not being open-minded',
    'Bad personal hygiene',
    'Using my things without asking',
    'Having guests over too much',
    'Messy'
]

encoded_df = pd.DataFrame()
# Create a new column for each concern and encode as yes if concern is present, no otherwise
for concern in concerns_list:
    encoded_df[concern] = df['Concerns_about_Future_Roommate'].apply(lambda x: "1" if concern in x else "0")

# Rename columns with a prefix
prefix = 'Concern_'
encoded_df.rename(columns={col: prefix + col for col in concerns_list}, inplace=True)


encoded_df.head()

Unnamed: 0,Concern_Obnoxious behavior,Concern_Loud music/TV,Concern_Being loud when I’m studying/sleeping,Concern_Eating my food,Concern_Not picking up after themselves,Concern_Being unfriendly,Concern_Lack of respect for me and my privacy,Concern_Not being open-minded,Concern_Bad personal hygiene,Concern_Using my things without asking,Concern_Having guests over too much,Concern_Messy
0,0,0,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,1,0,1,0,1,1,0,0
2,1,1,0,0,1,0,1,0,0,0,0,0
3,1,0,0,0,1,1,1,0,0,0,0,0
4,1,0,0,0,1,0,1,0,0,0,0,1


In [74]:
# Define the order for each ordinal column
ordering = {
    'Bedtime_Preference': ['Before midnight', 'Around midnight', 'at any time', 'After midnight'],
    'Wake_Up_Time_Preference': ['Very early', 'Somewhat early', 'at any time', 'Late', 'As late as possible'],
    'Planned_Study_Time_Preference': ['Morning', 'Afternoon', 'Evening', 'Late night'],
    'Private_Time_Requirements': ['Very little', 'Some', 'Significant'],
    'Guest_Frequency_Preference': ['Never', 'Rarely', 'Sometimes', 'Often', 'Frequently'],
}

columns_to_encode = list(ordering.keys())

# Initialize OrdinalEncoder with predefined categories
ordinal_encoder = OrdinalEncoder(categories=[ordering[col] for col in columns_to_encode])

# Apply OrdinalEncoder to the specified columns in df
encoded_columns = ordinal_encoder.fit_transform(df[columns_to_encode])

# Create a DataFrame from the encoded columns
encoded_columns_df = pd.DataFrame(encoded_columns, columns=columns_to_encode, index=encoded_df.index)

# Update encoded_df with the encoded columns
encoded_df[columns_to_encode] = encoded_columns_df

In [75]:
encoded_df.head()

Unnamed: 0,Concern_Obnoxious behavior,Concern_Loud music/TV,Concern_Being loud when I’m studying/sleeping,Concern_Eating my food,Concern_Not picking up after themselves,Concern_Being unfriendly,Concern_Lack of respect for me and my privacy,Concern_Not being open-minded,Concern_Bad personal hygiene,Concern_Using my things without asking,Concern_Having guests over too much,Concern_Messy,Bedtime_Preference,Wake_Up_Time_Preference,Planned_Study_Time_Preference,Private_Time_Requirements,Guest_Frequency_Preference
0,0,0,0,0,0,0,0,0,0,0,0,1,1.0,4.0,2.0,1.0,1.0
1,0,0,0,0,1,0,1,0,1,1,0,0,3.0,1.0,1.0,2.0,4.0
2,1,1,0,0,1,0,1,0,0,0,0,0,3.0,1.0,2.0,2.0,3.0
3,1,0,0,0,1,1,1,0,0,0,0,0,3.0,0.0,3.0,0.0,2.0
4,1,0,0,0,1,0,1,0,0,0,0,1,3.0,3.0,3.0,1.0,1.0


#### nominal feautures encoding

In [76]:
# Get the columns that are not in ordering.keys()
nominal = [col for col in df.columns if col not in ordering.keys() and col not in 'Concerns_about_Future_Roommate' and col not in 'Age' ]
nominal


['Gender',
 'Faculty',
 'Religion',
 'Parents_Abroad',
 'Living_with_Parents',
 'Do_you_smoke',
 'Attitude_towards_Roommate_Smoking',
 'Preferred_Roommate_Relationship_Type',
 'Ideal_Study_Environment_Description',
 'Attitude_towards_Borrowing_Sharing',
 'Description_of_Personal_Room_At_Home',
 'Desired_Room_Attributes',
 'Expected_Study_Time_Preference',
 'Study_Time_Preference',
 'Conflict_Handling_Method',
 'Communication_Preference_with_Roommate']

In [77]:
for column in nominal:
    unique_values = df[column].unique()
    print(f"Column: {column}")
    print(unique_values)
    print()

Column: Gender
['Male' 'Female']

Column: Faculty
['Computer Science' 'Medicine' 'Textile Engineering' 'Business' 'Science'
 'Engineering' 'Dentistry' 'Pharmacy' 'Nursing' 'Health Sciences']

Column: Religion
['Islam' 'Christianity']

Column: Parents_Abroad
['No' 'Yes']

Column: Living_with_Parents
['Yes' 'No']

Column: Do_you_smoke
['No' 'Yes' 'Sometimes']

Column: Attitude_towards_Roommate_Smoking
['Yes' 'Maybe' 'No']

Column: Preferred_Roommate_Relationship_Type
['Peaceful coexistence' 'Close companionship']

Column: Ideal_Study_Environment_Description
['Very quiet' 'Background noise' 'High noise tolerance']

Column: Attitude_towards_Borrowing_Sharing
['Ask before sharing' 'Prefer personal items' 'Share freely']

Column: Description_of_Personal_Room_At_Home
['Clean and organized' 'Neat' 'Disorganized' 'Cluttered']

Column: Desired_Room_Attributes
['Social & quiet' 'Study oriented' 'Social gathering']

Column: Expected_Study_Time_Preference
['In room' 'Both' 'Outside room']

Column: 

#### Label encode nominal data

In [78]:
label_encoders = {}
for col in nominal:
    encoder = LabelEncoder()
    encoded_df[col + '_encoded'] = encoder.fit_transform(df[col])
    label_encoders[col] = encoder


In [81]:
# Save ordinal encoder mappings
ordinal_mappings = {col: dict(zip(categories, ordinal_encoder.categories_[i])) for i, (col, categories) in enumerate(ordering.items())}

# Save nominal encoder mappings
nominal_mappings = {col: dict(zip(label_encoders[col].classes_, label_encoders[col].transform(label_encoders[col].classes_))) for col in nominal}

# Display the encoded dataframe
print("Ordinal Mappings:", ordinal_mappings)
print("Nominal Mappings:", nominal_mappings)

Ordinal Mappings: {'Bedtime_Preference': {'Before midnight': 'Before midnight', 'Around midnight': 'Around midnight', 'at any time': 'at any time', 'After midnight': 'After midnight'}, 'Wake_Up_Time_Preference': {'Very early': 'Very early', 'Somewhat early': 'Somewhat early', 'at any time': 'at any time', 'Late': 'Late', 'As late as possible': 'As late as possible'}, 'Planned_Study_Time_Preference': {'Morning': 'Morning', 'Afternoon': 'Afternoon', 'Evening': 'Evening', 'Late night': 'Late night'}, 'Private_Time_Requirements': {'Very little': 'Very little', 'Some': 'Some', 'Significant': 'Significant'}, 'Guest_Frequency_Preference': {'Never': 'Never', 'Rarely': 'Rarely', 'Sometimes': 'Sometimes', 'Often': 'Often', 'Frequently': 'Frequently'}}
Nominal Mappings: {'Gender': {'Female': np.int64(0), 'Male': np.int64(1)}, 'Faculty': {'Business': np.int64(0), 'Computer Science': np.int64(1), 'Dentistry': np.int64(2), 'Engineering': np.int64(3), 'Health Sciences': np.int64(4), 'Medicine': np.in

In [90]:
import numpy as np

# Define ordinal and nominal mappings (including 'Age_normalized' as a placeholder)
ordinal_mappings = {
    'Bedtime_Preference': {'Before midnight': 0, 'Around midnight': 1, 'at any time': 2, 'After midnight': 3},
    'Wake_Up_Time_Preference': {'Very early': 0, 'Somewhat early': 1, 'at any time': 2, 'Late': 3, 'As late as possible': 4},
    'Planned_Study_Time': {'Morning': 0, 'Afternoon': 1, 'Evening': 2, 'Late night': 3},
    'Private_Time_Requirements': {'Very little': 0, 'Some': 1, 'Significant': 2},
    'Guest_Frequency_Preference': {'Never': 0, 'Rarely': 1, 'Sometimes': 2, 'Often': 3, 'Frequently': 4},
    'Ideal_Study_Environment_Description_encoded': {'Background noise': 0, 'High noise tolerance': 1, 'Very quiet': 2},
    'Attitude_towards_Borrowing_Sharing': {'Ask before sharing': 0, 'Prefer personal items': 1, 'Share freely': 2},
    'Description_of_Personal_Room_At_Home_encoded': {'Clean and organized': 0, 'Cluttered': 1, 'Disorganized': 2, 'Neat': 3},
    'Desired_Room_Attributes_encoded': {'Social & quiet': 0, 'Social gathering': 1, 'Study oriented': 2},
    'Study_Time_Preference': {'Background noise': 0, 'No preference': 1, 'Total quiet': 2},
    'Conflict_Handling_Method_encoded': {'Avoid conflict': 0, 'Blunt': 1, 'Calm': 2, 'Hint jokingly': 3},
    'Communication_Preference_with_Roommate': {'Face-to-face': 0, 'Notes/letters': 1, 'Text or messaging apps': 2}
}

nominal_mappings = {
    'Faculty': {'Business': 0, 'Computer Science': 1, 'Dentistry': 2, 'Engineering': 3, 'Health Sciences': 4,
                'Medicine': 5, 'Nursing': 6, 'Pharmacy': 7, 'Science': 8, 'Textile Engineering': 9},
    'Attitude_towards_Roommate_Smoking': {'Maybe': 0, 'No': 1, 'Yes': 2},
}

def standardize_age(age):
    mean_age = 19.64453125  # Replace with the actual mean from your dataset
    std_age = 1.27825301087223  # Replace with the actual standard deviation from your dataset

    normalized_age = (age - mean_age) / std_age
    return normalized_age

# Function to encode values based on mappings for selected features
def encode_selected_features(features, values, ordinal_mappings, nominal_mappings):
    encoded_values = []
    
    for feature, value in zip(features, values):
        if feature in ordinal_mappings:
            mapping = ordinal_mappings[feature]
            if value in mapping:
                encoded_value = mapping[value]
                encoded_values.append(encoded_value)
            else:
                encoded_values.append(None)  # Handle unknown values if needed
        elif feature in nominal_mappings:
            mapping = nominal_mappings[feature]
            if value in mapping:
                encoded_value = mapping[value]
                encoded_values.append(encoded_value)
            else:
                encoded_values.append(None)  # Handle unknown values if needed
        elif feature == 'Age':
            normalized_age_value = standardize_age(value)
            encoded_values.append(normalized_age_value)  # Age_normalized is already normalized
        else:
            encoded_values.append(None)  # Handle unknown features if needed
    
    return encoded_values

# Example data to encode (subset of features)
features_subset = [
    'Bedtime_Preference', 'Wake_Up_Time_Preference', 'Planned_Study_Time', 'Private_Time_Requirements',
    'Guest_Frequency_Preference', 'Faculty', 'Attitude_towards_Roommate_Smoking',
    'Ideal_Study_Environment_Description_encoded', 'Attitude_towards_Borrowing_Sharing',
    'Description_of_Personal_Room_At_Home_encoded', 'Desired_Room_Attributes_encoded',
    'Study_Time_Preference', 'Conflict_Handling_Method_encoded', 'Communication_Preference_with_Roommate',
    'Age'
]
values_subset = [
    'Around midnight', 'As late as possible', 'Evening', 'Some', 'Rarely', 'Computer Science', 'Yes',
    'Very quiet', 'Ask before sharing', 'Clean and organized', 'Social & quiet', 'No preference', 'Hint jokingly',
    'Face-to-face', 21
]

# Encode the subset of values
encoded_values_subset = encode_selected_features(features_subset, values_subset, ordinal_mappings, nominal_mappings)

# Display the encoded values
print("Encoded values for subset of features:", encoded_values_subset)


Encoded values for subset of features: [1, 4, 2, 1, 1, 1, 2, 2, 0, 0, 0, 1, 3, 0, 1.0604072421273476]


In [87]:
df['Age'] = df['Age'].astype(int)

mean_age = df['Age'].mean()
std_age = df['Age'].std()

print(mean_age)
print(std_age)

19.64453125
1.27825301087223


#### Normalize Age

In [66]:
scaler = StandardScaler()
encoded_df['Age_normalized'] = scaler.fit_transform(df[['Age']])

In [67]:
encoded_df.columns

Index(['Concern_Obnoxious behavior', 'Concern_Loud music/TV',
       'Concern_Being loud when I’m studying/sleeping',
       'Concern_Eating my food', 'Concern_Not picking up after themselves',
       'Concern_Being unfriendly',
       'Concern_Lack of respect for me and my privacy',
       'Concern_Not being open-minded', 'Concern_Bad personal hygiene',
       'Concern_Using my things without asking',
       'Concern_Having guests over too much', 'Concern_Messy',
       'Bedtime_Preference', 'Wake_Up_Time_Preference',
       'Planned_Study_Time_Preference', 'Private_Time_Requirements',
       'Guest_Frequency_Preference', 'Gender_encoded', 'Faculty_encoded',
       'Religion_encoded', 'Parents_Abroad_encoded',
       'Living_with_Parents_encoded', 'Do_you_smoke_encoded',
       'Attitude_towards_Roommate_Smoking_encoded',
       'Preferred_Roommate_Relationship_Type_encoded',
       'Ideal_Study_Environment_Description_encoded',
       'Attitude_towards_Borrowing_Sharing_encoded',
   

In [68]:
import pandas as pd

# Assuming your DataFrame is named df
new_column_names = {
    'Concern_Obnoxious behavior': 'Concern_Obnoxious',
    'Concern_Loud music/TV': 'Concern_LoudMusicTV',
    'Concern_Being loud when I’m studying/sleeping': 'Concern_LoudWhenStudyingSleeping',
    'Concern_Eating my food': 'Concern_EatingMyFood',
    'Concern_Not picking up after themselves': 'Concern_NotPickingUpAfterThemselves',
    'Concern_Being unfriendly': 'Concern_Unfriendly',
    'Concern_Lack of respect for me and my privacy': 'Concern_LackOfRespect',
    'Concern_Not being open-minded': 'Concern_NotOpenMinded',
    'Concern_Bad personal hygiene': 'Concern_BadHygiene',
    'Concern_Using my things without asking': 'Concern_UsingMyThings',
    'Concern_Having guests over too much': 'Concern_GuestsOverTooMuch',
    'Concern_Messy': 'Concern_Messy',
    'Ideal_Study_Environment_Description': 'Ideal_Study_Environment',
    'Description_of_Personal_Room_At_Home': 'Personal_Room_Description',
    'Bedtime_Preference': 'Bedtime_Preference',
    'Wake_Up_Time_Preference': 'Wake_Up_Time_Preference',
    'Planned_Study_Time_Preference': 'Planned_Study_Time',
    'Private_Time_Requirements': 'Private_Time_Requirements',
    'Guest_Frequency_Preference': 'Guest_Frequency_Preference',
    'Expected_Study_Time_Preference': 'Expected_Study_Time',
    'Conflict_Handling_Method': 'Conflict_Handling_Method',
    'Preferred_Roommate_Relationship_Type': 'Preferred_Roommate_Relationship',
    'Desired_Room_Attributes': 'Desired_Room_Attributes',
    'Gender_encoded': 'Gender',
    'Age_encoded': 'Age',
    'Faculty_encoded': 'Faculty',
    'Religion_encoded': 'Religion',
    'Parents_Abroad_encoded': 'Parents_Abroad',
    'Living_with_Parents_encoded': 'Living_with_Parents',
    'Do_you_smoke_encoded': 'Do_you_smoke',
    'Attitude_towards_Roommate_Smoking_encoded': 'Attitude_towards_Roommate_Smoking',
    'Attitude_towards_Borrowing_Sharing_encoded': 'Attitude_towards_Borrowing_Sharing',
    'Study_Time_Preference_encoded': 'Study_Time_Preference',
    'Communication_Preference_with_Roommate_encoded': 'Communication_Preference_with_Roommate',
    'Age_normalized': 'Age_normalized'
}

encoded_df.rename(columns=new_column_names, inplace=True)

# Now df.columns will have the updated column names without "_encoded"


In [69]:
# Checking the number of rows and columns

num_rows, num_columns = encoded_df.shape

print(f"Number of Rows: {num_rows}")
print(f"Number of Columns: {num_columns}\n")

Number of Rows: 256
Number of Columns: 34



#### nominal feautures encoding

In [70]:
encoded_df.to_csv('../../data/processed/Roommate_Compatibility_Survey.csv', index=False)