# Importing and preparing rental apartments data

## Libraries and settings

In [2]:
# Libraries
import os
import re
import time
import fnmatch
import numpy as np
import pandas as pd

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

## Importing data

In [4]:
# Get current working directory
print(os.getcwd())

# Show all files in the directory
flist = fnmatch.filter(os.listdir('.'), '*.csv')
for i in flist:
    print(i)

# Read the data to a pandas data frame
df = pd.read_csv("amazon.csv", sep=',', encoding='utf-8')

C:\Workspace_ZHAW\DA\test
amazon.csv


## Count number of rows and columns in the data frame

In [5]:
# Dimension (rows, columns)
print('Dimension:', df.shape)

# Number of rows
print('Number of rows:', df.shape[0])

# Number of columns
print('Number of columns:', df.shape[1])

Dimension: (250, 13)
Number of rows: 250
Number of columns: 13


## Get data types (raw-format from web scraping)

In [6]:
# Get data types (note that in pandas, a string is referred to as 'object')
df.dtypes

web-scraper-order        object
web-scraper-start-url    object
single                   object
single-href              object
title                    object
price                    object
info                     object
rating                   object
instock                  object
asin                     object
bestseller               object
dimension                object
img1-src                 object
dtype: object

## Extract and save relevant information from raw data using regular expressions (regex)

### Extract number of rooms

In [8]:
# Extract values from 'rooms_area_price_raw' strings
rooms = []
for i in df['rating']:
    d1 = re.findall('(.*)Sterne', i)
    try:
        d2 = d1[0].strip().replace(',', '.')
    except:
        d2 = None
    rooms.append(d2)

# Save as new variable in the pandas data frame
df['rating'] = pd.Series(rooms)
    
# Print first 5 values
print(df['rating'].head(5), '\n')
print(df['rating'].head(5))

0    3.5 von 5
1    4.7 von 5
2    3.9 von 5
3    4.6 von 5
4    3.9 von 5
Name: rating, dtype: object 

0    3.5 von 5
1    4.7 von 5
2    3.9 von 5
3    4.6 von 5
4    3.9 von 5
Name: rating, dtype: object


### Extract living area

In [16]:
# Extract values from 'rooms_area_price_raw' strings
area = []
for i in df['title']:
    d1 = re.findall('Stifte, (.*) stifte', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    area.append(d2)


# Print first 5 values
print(df['title'].head(5), '\n')

0      Fridolin 68066 Adressbuch Hokusai - Große Welle
1     Winsor & Newton Universität Pinsel Kit (4 Stück)
2    10 Stück Kreidetafel-Schild, 33 x 40,6 cm, gro...
3    Sharpie Fluo XL Textmarker | Keilspitze | gemi...
4    Original bunte Lesezeichen | Schöne Haftnotize...
Name: title, dtype: object 



### Extract rental price

In [22]:
# Extract values from 'price_raw' strings
price = []
for i in df['price']:
    d1 = re.findall('[0-9]+', i)
    try:
        d2 = d1[0].strip()
    except:
        d2 = None
    price.append(d2)

# Save as new variable in the pandas data frame
df['price'] = pd.Series(price)

# Print first 5 values
print(df['price'].head(5), '\n')
print(df['price'].head(5))

TypeError: expected string or bytes-like object

### Get data types of all variables including the new ones

In [29]:
df.dtypes
df.loc [df['rating'] == 4.5][['title', 'instock']]

Unnamed: 0,title,instock


## Count and identify missing values (if any)

In [30]:
# Count missing values
print(pd.isna(df).sum())

# Identify rows with missing values
df[df.isna().any(axis=1)]

web-scraper-order          0
web-scraper-start-url      0
single                     0
single-href                0
title                      0
price                    100
info                      10
rating                     0
instock                   96
asin                      32
bestseller                46
dimension                173
img1-src                 246
dtype: int64


Unnamed: 0,web-scraper-order,web-scraper-start-url,single,single-href,title,price,info,rating,instock,asin,bestseller,dimension,img1-src
0,1669910832-1,https://www.amazon.de/s?k=university+supplies&...,Fridolin 68066 Adressbuch Hokusai - Große Welle,https://www.amazon.de/Fridolin-68066-Adressbuc...,Fridolin 68066 Adressbuch Hokusai - Große Welle,"8,66€",Motiv: Katsushika Hokusai - Große Welle,3.5 von 5,,B00ALD6A2S,,‎15 x 12.5 x 2 cm; 99.79 Gramm,
1,1669910834-2,https://www.amazon.de/s?k=university+supplies&...,Winsor & Newton Universität Pinsel Kit (4 Stück),https://www.amazon.de/Winsor-Newton-Universit%...,Winsor & Newton Universität Pinsel Kit (4 Stück),"15,72€",Info zu diesem Artikel,4.7 von 5,,B071F4WRZS,,‎28 x 6 x 1.2 cm; 40 Gramm,
2,1669910837-3,https://www.amazon.de/s?k=university+supplies&...,"10 Stück Kreidetafel-Schild, 33 x 40,6 cm, gro...",https://www.amazon.de/Kreidetafel-Schild-doppe...,"10 Stück Kreidetafel-Schild, 33 x 40,6 cm, gro...",,10 Stück Kreidetafel-Schilder: Die bunte Wimpe...,3.9 von 5,,B0B19X5C5F,,,
3,1669910840-4,https://www.amazon.de/s?k=university+supplies&...,Sharpie Fluo XL Textmarker | Keilspitze | gemi...,https://www.amazon.de/Sharpie-Textmarker-Keils...,Sharpie Fluo XL Textmarker | Keilspitze | gemi...,"10,40€",Info zu diesem Artikel,4.6 von 5,Nur noch 14 auf Lager,B00BWK90DK,"Nr. 66,297 in Bürobedarf & Schreibwaren (Siehe...",‎15 x 19.4 x 2.2 cm; 90 Gramm,
4,1669910843-5,https://www.amazon.de/s?k=university+supplies&...,Original bunte Lesezeichen | Schöne Haftnotize...,https://www.amazon.de/Lesezeichen-verschiedene...,Original bunte Lesezeichen | Schöne Haftnotize...,"16,00€",Info zu diesem Artikel,3.9 von 5,Auf Lager.,B0B7TFN96Z,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,1669911603-246,https://www.amazon.de/s?k=university+supplies&...,Wandkalender 2022 2023 von SmartPanda - Kalend...,https://www.amazon.de/Wandkalender-2022-2023-S...,Wandkalender 2022 2023 von SmartPanda - Kalend...,"12,56€",Info zu diesem Artikel,4.6 von 5,,B09MLSD1BH,"Nr. 5,578 in Bürobedarf & Schreibwaren (Siehe ...",,
246,1669911606-247,https://www.amazon.de/s?k=university+supplies&...,"Oxford Collegeblock A4 punktkariert/dotted, 80...",https://www.amazon.de/Oxford-400155143-Schule-...,"Oxford Collegeblock A4 punktkariert/dotted, 80...","17,96€",Mit 80 Blatt (160 Seiten) bietet der Notizbloc...,4.8 von 5,Auf Lager.,B091FWJH7P,"Nr. 1,752 in Bürobedarf & Schreibwaren (Siehe ...",‎29.5 x 22 x 4.65 cm; 2.56 Kilogramm,
247,1669911609-248,https://www.amazon.de/s?k=university+supplies&...,TOYMYTOY Spiralblock Business Notizbuch Ringbu...,https://www.amazon.de/TOYMYTOY-Spiralblock-Not...,TOYMYTOY Spiralblock Business Notizbuch Ringbu...,,Info zu diesem Artikel,4.0 von 5,Auf Lager.,B078XDK5MH,"Nr. 34,336 in Bürobedarf & Schreibwaren (Siehe...",,
248,1669911612-249,https://www.amazon.de/s?k=university+supplies&...,"SUPERTOOL Projekt-Notizblock, A5, lose Blätter...",https://www.amazon.de/Supertool-Projektnotizbl...,Wie bewerten Sie heute Ihre Erfahrung beim Kau...,,,3.5 von 5,Nur noch 18 auf Lager,,,,


## Count and identify duplicated values (if any)

In [31]:
# Count duplicated values
print(df.duplicated().sum())

# Identify rows with duplicated values, e.g.:
df[df[['web-scraper-order', 'price', 'title']].duplicated()]

0


Unnamed: 0,web-scraper-order,web-scraper-start-url,single,single-href,title,price,info,rating,instock,asin,bestseller,dimension,img1-src


### Save data to file

In [32]:
df.to_csv('amazon_prepared.csv', 
          sep=",", 
          encoding='utf-8',
          index=False)

### Jupyter notebook --footer info-- (please always provide this at the end of each submitted notebook)

In [33]:
import os
import platform
import socket
from platform import python_version
from datetime import datetime

print('-----------------------------------')
print(os.name.upper())
print(platform.system(), '|', platform.release())
print('Datetime:', datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
print('Python Version:', python_version())
print('-----------------------------------')

-----------------------------------
NT
Windows | 10
Datetime: 2022-12-01 17:42:17
Python Version: 3.9.7
-----------------------------------
