In [1]:
import pandas as pd
import numpy as np

In [2]:
data_file = "data/project-dataset-final.xlsx"
data = pd.read_excel(data_file, index_col=None)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 17 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   web-scraper-order      2858 non-null   object 
 1   web-scraper-start-url  2858 non-null   object 
 2   pagination             2751 non-null   object 
 3   listing-title          2858 non-null   object 
 4   listing-description    2858 non-null   object 
 5   listing-housing-type   2858 non-null   object 
 6   listing-features       2858 non-null   object 
 7   listing-notices        2858 non-null   object 
 8   listing-id             2858 non-null   object 
 9   listing-link           2858 non-null   object 
 10  listing-link-href      2858 non-null   object 
 11  listing-availability   1534 non-null   object 
 12  listing-posting-date   2858 non-null   object 
 13  listing-address        2098 non-null   object 
 14  listing-map-latitude   2857 non-null   float64
 15  list

In [3]:
############################################### Creating a Data Quality Report ###############################################
## Get the data types of each column of the dataset
data_types = pd.DataFrame(
    data.dtypes,
    columns=['Data Type']
)
data_types

Unnamed: 0,Data Type
web-scraper-order,object
web-scraper-start-url,object
pagination,object
listing-title,object
listing-description,object
listing-housing-type,object
listing-features,object
listing-notices,object
listing-id,object
listing-link,object


In [4]:
## Get the total values in the each column
total_values = pd.DataFrame(
    data.count(),
    columns=['Total Values']
)
total_values

Unnamed: 0,Total Values
web-scraper-order,2858
web-scraper-start-url,2858
pagination,2751
listing-title,2858
listing-description,2858
listing-housing-type,2858
listing-features,2858
listing-notices,2858
listing-id,2858
listing-link,2858


In [5]:
## Check total missing values in each column of the dataset
missing_data = pd.DataFrame(
    data.isnull().sum(),
    columns=['Missing Values']
)
missing_data

Unnamed: 0,Missing Values
web-scraper-order,0
web-scraper-start-url,0
pagination,107
listing-title,0
listing-description,0
listing-housing-type,0
listing-features,0
listing-notices,0
listing-id,0
listing-link,0


In [6]:
## Check total unique values in each column of the dataset
unique_values = pd.DataFrame(
    data.nunique(),
    columns=['Unique Values']
)
unique_values

Unnamed: 0,Unique Values
web-scraper-order,2858
web-scraper-start-url,1
pagination,24
listing-title,2404
listing-description,2324
listing-housing-type,35
listing-features,643
listing-notices,2
listing-id,2858
listing-link,1680


In [7]:
dq_report = data_types.join(total_values).join(missing_data).join(unique_values)
dq_report

Unnamed: 0,Data Type,Total Values,Missing Values,Unique Values
web-scraper-order,object,2858,0,2858
web-scraper-start-url,object,2858,0,1
pagination,object,2751,107,24
listing-title,object,2858,0,2404
listing-description,object,2858,0,2324
listing-housing-type,object,2858,0,35
listing-features,object,2858,0,643
listing-notices,object,2858,0,2
listing-id,object,2858,0,2858
listing-link,object,2858,0,1680


In [8]:
## Get the listing-description fields from the data
listing_description = pd.DataFrame(
    data['listing-description'],
    columns=['listing-description']
)
listing_description.head(10)

Unnamed: 0,listing-description
0,QR Code Link to This Post\n \n ...
1,QR Code Link to This Post\n \n ...
2,QR Code Link to This Post\n \n ...
3,QR Code Link to This Post\n \n ...
4,QR Code Link to This Post\n \n ...
5,QR Code Link to This Post\n \n ...
6,QR Code Link to This Post\n \n ...
7,QR Code Link to This Post\n \n ...
8,QR Code Link to This Post\n \n ...
9,QR Code Link to This Post\n \n ...


In [54]:
import re
regex_tab = re.compile(r'(\s{2,})', flags=re.IGNORECASE)
regex_newline = re.compile(r'\n', flags=re.IGNORECASE)
description = listing_description['listing-description'].str.replace(regex_newline, ';', regex=True)
description = description.str.replace(regex_tab, '', regex=True)
regex_semicol = re.compile(r';+', flags=re.IGNORECASE)
description = description.str.replace(regex_semicol, ';', regex=True)
# cols = description.select_dtypes(object).columns
# new_description = description[cols].apply(lambda x: x.str.strip())
#
# import re
# new_description.replace(to_replace=[None, u''], value=np.nan, inplace=True)
description = description.str.strip()
description

0       QR Code Link to This Post;Apartment home avail...
1       QR Code Link to This Post;Call Apartment Guys ...
2       QR Code Link to This Post;Contact me to learn ...
3       QR Code Link to This Post;PROPERTY INFO;ID: 23...
4       QR Code Link to This Post;4535 N Paulina Unit ...
                              ...                        
2853    QR Code Link to This Post;Interested in this p...
2854    QR Code Link to This Post;We are conveniently ...
2855    QR Code Link to This Post;Interested in this p...
2856    QR Code Link to This Post;PROPERTY INFO;ID: 23...
2857    QR Code Link to This Post;1BR / 1Ba;apartment;...
Name: listing-description, Length: 2858, dtype: object

In [176]:
description = new_description.dropna(how='all', axis=1)
description

Unnamed: 0,0,3,4,5,6,7,8,9,10,11,...,205,207,211,213,217,219,223,225,231,237
0,QR Code Link to This Post,Apartment home available July 1st on Chicago’s...,,,,"$500 OFF SECOND MONTH""S RENT!!",,,,Building Features,...,,,,,,,,,,
1,QR Code Link to This Post,Call Apartment Guys at 773-549-3474 and ask fo...,,,,,,,,,...,,,,,,,,,,
2,QR Code Link to This Post,Contact me to learn more about an innovative L...,,Basic Requirements to Qualify**:,,• FICO Score 620+,,"• Annual household income of $60,000+",,• Max Debt To Income (DTI) Ratio is 50% (45% f...,...,,,,,,,,,,
3,QR Code Link to This Post,PROPERTY INFO,,"ID: 232839975Rent: $3,500 / MonthBeds: 2Bath: ...",,VIDEO TOUR: www.chicagodoorstep.com/washington,,,,Parking and large storage unit INCLUDED!!,...,,,,,,,,,,
4,QR Code Link to This Post,"4535 N Paulina Unit #3D Chicago, IL 60640",,,,Bright & Sunny 1 bedroom unit in Ravenswood,,,,"Close to the Brown Line train Montrose stop, a...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2853,QR Code Link to This Post,Interested in this property?,,,,Click on: Reply to this listing for more infor...,,,,"** if you are texting us, please include the l...",...,,,,,,,,,,
2854,QR Code Link to This Post,"We are conveniently located near Edens, Metra ...",,,,Located in Skokie near both Old Orchard Shoppi...,,,,,...,,,,,,,,,,
2855,QR Code Link to This Post,Interested in this property?,,,,Click on: Reply to this listing for more infor...,,,,"** if you are texting us, please include the l...",...,,,,,,,,,,
2856,QR Code Link to This Post,PROPERTY INFO,,"ID: 237708192Rent: $3,147 / MonthBeds: 3Bath: ...",,Available Aug 8th and Sept 9th,,Will hold for 30 days,,,...,,,,,,,,,,


In [192]:
# Initialize columns
cols_concat = description.columns

# Convert them to type str
description[cols_concat].astype('str')

# Then concatenate them as follows
features = description[cols_concat].T.agg('\X0'.join)
features

0       QR Code Link to This Post\tApartment home avai...
1       QR Code Link to This Post\tCall Apartment Guys...
2       QR Code Link to This Post\tContact me to learn...
3       QR Code Link to This Post\tPROPERTY INFO\tnan\...
4       QR Code Link to This Post\t4535 N Paulina Unit...
                              ...                        
2853    QR Code Link to This Post\tInterested in this ...
2854    QR Code Link to This Post\tWe are conveniently...
2855    QR Code Link to This Post\tInterested in this ...
2856    QR Code Link to This Post\tPROPERTY INFO\tnan\...
2857    QR Code Link to This Post\t1BR / 1Ba\tnan\tapa...
Length: 2858, dtype: object

In [1]:
features.to_excel("data/description.xlsx", index=False)
features

NameError: name 'features' is not defined