# EDA & ETL Part

In [1]:
#download needed packages
!pip install python-dotenv

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting python-dotenv
  Downloading python_dotenv-0.21.0-py3-none-any.whl (18 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.21.0


In [2]:
#import needed packages
import os
import pandas as pd
import zipfile as zp
import sqlalchemy as conn
from dotenv import dotenv_values
# Tell the Kaggle CLI which directory to use as its config dir
# (the Colab working directory, /content).
KAGGLE_CONFIG_DIR = '/content'
os.environ['KAGGLE_CONFIG_DIR'] = KAGGLE_CONFIG_DIR

In [3]:
# Read our secrets from the local .env file into `config`
# (looked up by key later, e.g. config.get('DB_CONN')).
env_file = ".env"
config = dotenv_values(dotenv_path=env_file)

In [4]:
# Download the Sakani projects dataset from Kaggle with the Kaggle CLI.
# Per the recorded output, the archive is saved as
# sakani-projects-saudi-arabia.zip in /content.
!kaggle datasets download -d majedalhulayel/sakani-projects-saudi-arabia

Downloading sakani-projects-saudi-arabia.zip to /content
  0% 0.00/25.5k [00:00<?, ?B/s]
100% 25.5k/25.5k [00:00<00:00, 29.7MB/s]


In [5]:
# Unzip the downloaded archive into a local dataset folder.
dataset_folder = 'dataset'  # destination directory for the extracted files
dataset_path = 'sakani-projects-saudi-arabia.zip'  # archive downloaded from Kaggle
try:
  # Create the destination only when it is missing; extraction is identical either way.
  if not os.path.exists(dataset_folder):
    print(f'Creating new folder: {dataset_folder}\n')
    os.mkdir(dataset_folder)
  with zp.ZipFile(dataset_path) as data: #original file path
    data.extractall(dataset_folder) #saving path
  print(f"Done extracting all files to: {dataset_folder}")
except zp.BadZipFile as err:
  # The file exists but is not a valid zip archive.
  print(f"Invalid file: {err}")
except OSError as err:
  # Missing archive, permission problems, etc. Report the real cause instead of
  # a blanket "Invalid file", and no longer swallow KeyboardInterrupt/SystemExit.
  print(f"Could not extract {dataset_path}: {err}")

Creating new folder: dataset

Done extracting all files to: dataset


In [6]:
# Load the extracted CSV. The path is built from dataset_folder so it stays in
# sync with the extraction destination instead of repeating the folder name.
df = pd.read_csv(os.path.join(dataset_folder, "Sakani Projects.csv"))
# Preview the first rows
df.head()

Unnamed: 0,id,location_lat,location_lon,city_id,city_name_ar,city_name_en,region_id,region_key,region_name_ar,region_name_en,...,resource_id,resource_type,subsidizable,type,under_construction_status,unit_types_0,unit_types_1,unit_types_2,use_register_interest_flag,views_count
0,project_1044,24.560109,46.519438,14,الرياض,AR RIYADH,1,riyadh,الرياض,Riyadh,...,1044,projects,1,marketplaces,under_construction,apartment,,,0,1039
1,project_1033,24.793165,46.633768,14,الرياض,AR RIYADH,1,riyadh,الرياض,Riyadh,...,1033,projects,1,marketplaces,under_construction,apartment,,,0,1837
2,project_681,24.058337,42.836919,3025,عفيف,AFIF,1,riyadh,الرياض,Riyadh,...,681,projects,1,marketplaces,,land,,,0,5995
3,project_378,18.446355,42.098271,3274,محايل,MUHAYIL,7,asir,عسير,'Asir,...,378,projects,1,marketplaces,,land,,,0,372872
4,project_386,24.545443,44.404584,716,الدوادمي,AD DUWADIMI,1,riyadh,الرياض,Riyadh,...,386,projects,1,marketplaces,,land,,,0,50459


In [7]:
# Summarise the dataset: number of entries, per-column non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 386 entries, 0 to 385
Data columns (total 51 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  386 non-null    object 
 1   location_lat                        386 non-null    float64
 2   location_lon                        386 non-null    float64
 3   city_id                             386 non-null    int64  
 4   city_name_ar                        386 non-null    object 
 5   city_name_en                        386 non-null    object 
 6   region_id                           386 non-null    int64  
 7   region_key                          386 non-null    object 
 8   region_name_ar                      386 non-null    object 
 9   region_name_en                      386 non-null    object 
 10  region_order_sequence               386 non-null    int64  
 11  availability                        386 non-n

In [8]:
# Dig deeper into the dataset: list every column name
df.columns

Index(['id', 'location_lat', 'location_lon', 'city_id', 'city_name_ar',
       'city_name_en', 'region_id', 'region_key', 'region_name_ar',
       'region_name_en', 'region_order_sequence', 'availability',
       'available_units_count', 'available_units_for_auctions_count',
       'basement', 'bookable', 'city_order_sequence', 'delegated_by_broker',
       'developer_name', 'driver_room', 'elevator', 'group_unit_id',
       'maid_room', 'max_area', 'max_bathroom', 'max_bed_room', 'max_price',
       'max_street_width', 'max_unit_age', 'min_area', 'min_bathroom',
       'min_bed_room', 'min_price', 'min_street_width', 'min_unit_age', 'pool',
       'project_name', 'project_type', 'promoted', 'publish', 'publish_date',
       'resource_id', 'resource_type', 'subsidizable', 'type',
       'under_construction_status', 'unit_types_0', 'unit_types_1',
       'unit_types_2', 'use_register_interest_flag', 'views_count'],
      dtype='object')

In [9]:
# Unique unit types (distinct values of the unit_types_0 column)
df['unit_types_0'].unique()

array(['apartment', 'land', 'townhouse', 'villa'], dtype=object)

In [10]:
# Unique project types (distinct values of the project_type column)
df['project_type'].unique()

array(['offplan_private_land', 'lands_moh_land', 'offplan_moh_land',
       'readymade_private_land', 'readymade_moh_land',
       'lands_private_land', 'advanced_self_construction'], dtype=object)

In [11]:
# Unique developer names; the recorded output includes nan, i.e. some rows have no developer
df['developer_name'].unique()

array(['شركة المسكن المحدودة',
       'شركة جود العمارة للتطوير والاستثمار العقاري', nan,
       'الشركة الوطنية للإسكان', 'علي شار', 'الرائم للمقاولات',
       'دار واعمار للاستثمار والتطوير العقاري',
       'شركة الأثير للتطوير والاستثمار العقاري',
       'شركة الفاف للتطوير العقاري', 'عبدالرحمن الراشد وأولاده', 'رتال',
       'شركة الأولى لتطوير العقارات', 'ثبات المسكن العقارية',
       'مكيون مطورون عمرانيون', 'شركة سمو العقارية مساهمة مدرجة',
       'شركة درة الذهبية للتطوير العقاري',
       'شركة مرسى البحار للمقاولات العامة', 'شركة أفاق الذهبية المحدودة',
       'شركة عراب التمليك للتطوير العقاري',
       'شركة ساطعة التعمير للتطوير والاستثمار العقاري',
       'شركة تمكين للاستثمار و التطوير العقارى',
       'فرع شركة خطى الخير للتشغيل والصيانة',
       'شركة الحناكي للتطوير العقاري',
       'شركة فيصل عبدالله بن سعيدان للاستثمار والتطويرالعقاري',
       'شركة مساكن أفياء للتطوير والاستثمار العقاري',
       'شركة محمد عبدالعزيز الحبيب وشركاؤه للاستثمار العقاري',
       'شركة دور

In [12]:
# Unique region keys (distinct values of the region_key column)
df['region_key'].unique()

array(['riyadh', 'asir', 'qassim', 'northern_borders', 'makkah', 'jizan',
       'hail', 'madinah', 'tabuk', 'jawf', 'eastern_province', 'najran',
       'bahah'], dtype=object)

In [13]:
# Column list before dropping the unwanted columns (for comparison with the list after the drop)
df.columns

Index(['id', 'location_lat', 'location_lon', 'city_id', 'city_name_ar',
       'city_name_en', 'region_id', 'region_key', 'region_name_ar',
       'region_name_en', 'region_order_sequence', 'availability',
       'available_units_count', 'available_units_for_auctions_count',
       'basement', 'bookable', 'city_order_sequence', 'delegated_by_broker',
       'developer_name', 'driver_room', 'elevator', 'group_unit_id',
       'maid_room', 'max_area', 'max_bathroom', 'max_bed_room', 'max_price',
       'max_street_width', 'max_unit_age', 'min_area', 'min_bathroom',
       'min_bed_room', 'min_price', 'min_street_width', 'min_unit_age', 'pool',
       'project_name', 'project_type', 'promoted', 'publish', 'publish_date',
       'resource_id', 'resource_type', 'subsidizable', 'type',
       'under_construction_status', 'unit_types_0', 'unit_types_1',
       'unit_types_2', 'use_register_interest_flag', 'views_count'],
      dtype='object')

In [14]:
# Unwanted columns to delete
unwanted_columns = [
    'city_id', 'region_id', 'region_key', 'region_order_sequence',
    'city_order_sequence', 'group_unit_id', 'promoted', 'unit_types_1',
    'unit_types_2', 'type', 'resource_id', 'resource_type', 'subsidizable',
]
# Rebind df instead of using inplace=True, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=unwanted_columns, errors='ignore')

In [15]:
# Column list after the drop, to confirm the unwanted columns are gone
df.columns

Index(['id', 'location_lat', 'location_lon', 'city_name_ar', 'city_name_en',
       'region_name_ar', 'region_name_en', 'availability',
       'available_units_count', 'available_units_for_auctions_count',
       'basement', 'bookable', 'delegated_by_broker', 'developer_name',
       'driver_room', 'elevator', 'maid_room', 'max_area', 'max_bathroom',
       'max_bed_room', 'max_price', 'max_street_width', 'max_unit_age',
       'min_area', 'min_bathroom', 'min_bed_room', 'min_price',
       'min_street_width', 'min_unit_age', 'pool', 'project_name',
       'project_type', 'publish', 'publish_date', 'under_construction_status',
       'unit_types_0', 'use_register_interest_flag', 'views_count'],
      dtype='object')

In [16]:
# Report how many columns remain after filtering
remaining_columns = df.shape[1]
print(f'Length of Data (After Column Filtered): {remaining_columns}')

Length of Data (After Column Filtered): 38


In [17]:
# Clean row-level data.
# The results are assigned back to the column: calling fillna/ffill with
# inplace=True on df[col] acts on an intermediate Series and does not update df
# when pandas Copy-on-Write is active.
df['developer_name'] = df['developer_name'].fillna('لا يوجد مدخل')
# Fill missing publish dates with the previous row's value; a missing value in
# the very first row has nothing before it and stays NaN.
df['publish_date'] = df['publish_date'].ffill()
df['under_construction_status'] = df['under_construction_status'].fillna('no entry')

In [18]:
# Preview the first rows after cleaning to check the filled values
df.head()

Unnamed: 0,id,location_lat,location_lon,city_name_ar,city_name_en,region_name_ar,region_name_en,availability,available_units_count,available_units_for_auctions_count,...,min_unit_age,pool,project_name,project_type,publish,publish_date,under_construction_status,unit_types_0,use_register_interest_flag,views_count
0,project_1044,24.560109,46.519438,الرياض,AR RIYADH,الرياض,Riyadh,1,14,0,...,0,0,المسكن ١٢ - الرياض,offplan_private_land,1,2022-08-30,under_construction,apartment,0,1039
1,project_1033,24.793165,46.633768,الرياض,AR RIYADH,الرياض,Riyadh,1,0,0,...,0,0,جلامور - الرياض,offplan_private_land,1,2022-08-29,under_construction,apartment,0,1837
2,project_681,24.058337,42.836919,عفيف,AFIF,الرياض,Riyadh,1,61,0,...,0,0,مخطط 879 - العيدانية,lands_moh_land,1,2022-06-30,no entry,land,0,5995
3,project_378,18.446355,42.098271,محايل,MUHAYIL,عسير,'Asir,0,0,0,...,0,0,مخطط جوهرة محايل - عسير,lands_moh_land,1,2022-01-20,no entry,land,0,372872
4,project_386,24.545443,44.404584,الدوادمي,AD DUWADIMI,الرياض,Riyadh,0,0,0,...,0,0,مخطط طيبة العليا - الدوادمي,lands_moh_land,1,2022-01-20,no entry,land,0,50459


In [19]:
# Create the connection to the database.
# config.get returns None when DB_CONN is absent from .env; fail early with a
# clear message instead of passing None to create_engine.
db_url = config.get('DB_CONN')
if not db_url:
    raise ValueError("DB_CONN is missing or empty in .env; cannot create the database engine")
engine = conn.create_engine(db_url)

In [21]:
# Persist the cleaned data next to the raw dataset
cleaned_csv_path = f'{dataset_folder}/cleaned_data.csv'
df.to_csv(cleaned_csv_path, index=False)
# Load the same frame into the database table 'sakani', replacing any existing table
df.to_sql(name='sakani', con=engine, if_exists='replace', index=False)

# Analysis Part