### Exploratory Data Analysis in Python



In [1]:
# Libraries for Data wrangling
import pandas as pd
import numpy as np

# Libraries for Data Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px


In [2]:
# Read the raw rental listings from CSV into a DataFrame,
# then preview the first four rows with the .head method.

rent_df = pd.read_csv(filepath_or_buffer='House_Rent_Dataset.csv')

rent_df.head(n=4)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
0,5/18/2022,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner
1,5/13/2022,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
2,5/16/2022,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner
3,7/4/2022,2,10000,800,1 out of 2,Super Area,Dumdum Park,Kolkata,Unfurnished,Bachelors/Family,1,Contact Owner


In [3]:
# Save a copy of the DataFrame to CSV. index=False keeps the row index out of
# the file, so reading it back does not produce an extra "Unnamed: 0" column.
# NOTE(review): no cleaning step has run at this point, so 'cleaned.csv' holds
# the data exactly as loaded - consider moving this cell after the cleaning cells.
rent_df.to_csv('cleaned.csv', index=False)

## Methods for Data Wrangling in pandas and NumPy

In [7]:
# .tail method: show the last five rows (n=5 is the default)

rent_df.tail(n=5)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact
4741,5/18/2022,2,15000,1000,3 out of 5,Carpet Area,Bandam Kommu,Hyderabad,Semi-Furnished,Bachelors/Family,2,Contact Owner
4742,5/15/2022,3,29000,2000,1 out of 4,Super Area,"Manikonda, Hyderabad",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Owner
4743,7/10/2022,3,35000,1750,3 out of 5,Carpet Area,"Himayath Nagar, NH 7",Hyderabad,Semi-Furnished,Bachelors/Family,3,Contact Agent
4744,7/6/2022,3,45000,1500,23 out of 34,Carpet Area,Gachibowli,Hyderabad,Semi-Furnished,Family,2,Contact Agent
4745,5/4/2022,2,15000,1000,4 out of 5,Carpet Area,Suchitra Circle,Hyderabad,Unfurnished,Bachelors,2,Contact Owner


In [4]:
# .shape attribute: a (rows, columns) tuple giving the DataFrame's dimensions

rent_df.shape

(4746, 12)

In [9]:
# .info method: prints the index range, each column's non-null count and dtype,
# and the DataFrame's memory usage

rent_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4746 entries, 0 to 4745
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4746 non-null   object
 1   BHK                4746 non-null   int64 
 2   Rent               4746 non-null   int64 
 3   Size               4746 non-null   int64 
 4   Floor              4746 non-null   object
 5   Area Type          4746 non-null   object
 6   Area Locality      4746 non-null   object
 7   City               4746 non-null   object
 8   Furnishing Status  4746 non-null   object
 9   Tenant Preferred   4746 non-null   object
 10  Bathroom           4746 non-null   int64 
 11  Point of Contact   4746 non-null   object
dtypes: int64(4), object(8)
memory usage: 445.1+ KB


In [7]:
# Split "Floor" (e.g. "1 out of 3") into two new text columns:
#   house_floor - the part before " out of "
#   no_floor    - the part after " out of "
# Rows whose "Floor" text does not match the pattern get NaN in both columns.

pattern = r'(.*) out of (.*)'

rent_df[['house_floor', 'no_floor']] = (
    rent_df['Floor'].str.extract(pattern, expand=True)
)

rent_df.head(n=3)

Unnamed: 0,Posted On,BHK,Rent,Size,Floor,Area Type,Area Locality,City,Furnishing Status,Tenant Preferred,Bathroom,Point of Contact,house_floor,no_floor
0,5/18/2022,2,10000,1100,Ground out of 2,Super Area,Bandel,Kolkata,Unfurnished,Bachelors/Family,2,Contact Owner,Ground,2
1,5/13/2022,2,20000,800,1 out of 3,Super Area,"Phool Bagan, Kankurgachi",Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3
2,5/16/2022,2,17000,1000,1 out of 3,Super Area,Salt Lake City Sector 2,Kolkata,Semi-Furnished,Bachelors/Family,1,Contact Owner,1,3


In [8]:
# .drop method: remove the original "Floor" column from rent_df
# (inplace=True modifies rent_df directly instead of returning a copy)

rent_df.drop(labels=['Floor'], axis='columns', inplace=True)

In [9]:
# .columns attribute: returns the column labels of the DataFrame as an Index

rent_df.columns

Index(['Posted On', 'BHK', 'Rent', 'Size', 'Area Type', 'Area Locality',
       'City', 'Furnishing Status', 'Tenant Preferred', 'Bathroom',
       'Point of Contact', 'house_floor', 'no_floor'],
      dtype='object')

In [10]:
# inspect column content with the .unique method: returns the distinct values
# found in house_floor (a missing value, if any, shows up as nan)

rent_df['house_floor'].unique()

array(['Ground', '1', '2', '4', '3', '5', '7', '8', 'Upper Basement',
       '11', 'Lower Basement', '6', '14', '43', '13', '18', '17', '9',
       '19', '60', '34', '12', '26', '25', '53', '16', '10', '39', '32',
       '47', '28', '20', '15', '65', '40', '37', '22', '21', '30', '35',
       '33', '44', '41', '46', '27', '45', '48', '50', '24', '23', '29',
       '49', '36', '76', nan], dtype=object)

In [11]:
# Count the missing values in each column
# (.isna flags missing cells; .sum adds them up per column)

rent_df.isna().sum()

Posted On            0
BHK                  0
Rent                 0
Size                 0
Area Type            0
Area Locality        0
City                 0
Furnishing Status    0
Tenant Preferred     0
Bathroom             0
Point of Contact     0
house_floor          4
no_floor             4
dtype: int64

In [12]:
# Handling missing values: dropna removes them, fillna would replace them.
# Here every row containing at least one missing value is dropped in place.

rent_df.dropna(axis='index', how='any', inplace=True)



In [13]:
# Re-check row counts, non-null counts and dtypes after the previous steps
rent_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4742 entries, 0 to 4745
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Posted On          4742 non-null   object
 1   BHK                4742 non-null   int64 
 2   Rent               4742 non-null   int64 
 3   Size               4742 non-null   int64 
 4   Area Type          4742 non-null   object
 5   Area Locality      4742 non-null   object
 6   City               4742 non-null   object
 7   Furnishing Status  4742 non-null   object
 8   Tenant Preferred   4742 non-null   object
 9   Bathroom           4742 non-null   int64 
 10  Point of Contact   4742 non-null   object
 11  house_floor        4742 non-null   object
 12  no_floor           4742 non-null   object
dtypes: int64(4), object(9)
memory usage: 518.7+ KB


In [14]:
# .reset_index: renumber the rows consecutively from 0, since dropping rows
# leaves gaps in the index; drop=True discards the old index instead of
# keeping it as a new column

rent_df.reset_index(inplace=True, drop=True)

In [15]:
# ASSIGNMENT: install the following packages
# scipy
# statsmodels
# scikit-learn (imported as sklearn)
# yellowbrick

In [10]:
!pip install scipy



In [17]:
!pip install statsmodels

Collecting statsmodels
  Using cached statsmodels-0.14.1-cp39-cp39-win_amd64.whl (10.0 MB)
Collecting scipy!=1.9.2,>=1.4
  Downloading scipy-1.12.0-cp39-cp39-win_amd64.whl (46.2 MB)
Collecting patsy>=0.5.4
  Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
Installing collected packages: scipy, patsy, statsmodels
Successfully installed patsy-0.5.6 scipy-1.12.0 statsmodels-0.14.1


You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.


In [11]:
!pip install sklearn



In [19]:
!pip install yellowbrick

Collecting yellowbrick
  Downloading yellowbrick-1.5-py3-none-any.whl (282 kB)
Installing collected packages: yellowbrick
Successfully installed yellowbrick-1.5


You should consider upgrading via the 'C:\Users\Administrator\AppData\Local\Programs\Python\Python39\python.exe -m pip install --upgrade pip' command.
