## Basics

In [79]:
#All import statements

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Binarizer, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
import scipy as sc
#plotting
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
# Load the real-estate table from the CSV file in the working directory.
dataframe = pd.read_csv(filepath_or_buffer="real_estate.csv")

In [8]:
# Show the loaded real-estate DataFrame as this cell's output.
dataframe

Unnamed: 0,price,size,year
0,234314.144,643.09,2015
1,228581.528,656.22,2009
2,281626.336,487.29,2018
3,401255.608,1504.75,2015
4,458674.256,1275.46,2009
...,...,...,...
95,252460.400,549.80,2009
96,310522.592,1037.44,2009
97,383635.568,1504.75,2006
98,225145.248,648.29,2015


**Preprocessing - Scaling**

Using scikit-learn preprocessing tools

In [9]:
# Work on the underlying NumPy array so columns can be sliced by position.
array = dataframe.to_numpy()

In [28]:
# Rescale the size column onto the interval [0, 10].
size = array[:, 1:2]   # slice (not index) so the result stays a 2-D column
price = array[:, 0]

scaler = MinMaxScaler(feature_range=(0, 10))
scaler.fit(size)
rescaledSize = scaler.transform(size)

# Print the first five rescaled values, rounded to three decimals.
np.set_printoptions(precision=3)
print(rescaledSize[:5])  # every value lies between 0 and 10

[[1.199]
 [1.295]
 [0.055]
 [7.522]
 [5.839]]


In [29]:
# Binarize the year column: values strictly greater than 2015 become 1,
# everything else (2015 included) becomes 0.
year = array[:, 2:3]
binarizer = Binarizer(threshold=2015)
binarizer.fit(year)
binaryYear = binarizer.transform(year)
print(binaryYear[:5])

[[0.]
 [0.]
 [1.]
 [0.]
 [0.]]


In [31]:
# Standardize the size column with StandardScaler.
# Note: this rebinds `scaler`, which previously held the MinMaxScaler.
scaler = StandardScaler()
scaler.fit(size)
standardSize = scaler.transform(size)
print(standardSize[:5])

[[-0.708]
 [-0.664]
 [-1.234]
 [ 2.198]
 [ 1.425]]


**Cleansing**

In [51]:
# Load the book-metadata CSV and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='BL-Flickr-Images-Book.csv')
df.head(n=5)

Unnamed: 0,Identifier,Edition Statement,Place of Publication,Date of Publication,Publisher,Title,Author,Contributors,Corporate Author,Corporate Contributors,Former owner,Engraver,Issuance type,Flickr URL,Shelfmarks
0,206,,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,"FORBES, Walter.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12641.b.30.
1,216,,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12626.cc.2.
2,218,,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.","BLAZE DE BURY, Marie Pauline Rose - Baroness",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 12625.dd.1.
3,472,,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.","Appleyard, Ernest Silvanus.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 10369.bbb.15.
4,480,"A new edition, revised, etc.",London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.","BROOME, John Henry.",,,,,monographic,http://www.flickr.com/photos/britishlibrary/ta...,British Library HMNTS 9007.d.28.


In [52]:
# Columns that are not needed for the rest of the cleaning walkthrough.
to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]

# Rebind instead of mutating with inplace=True: `columns=` states the axis
# explicitly and the result is an ordinary new frame that can be chained.
df = df.drop(columns=to_drop)

In [53]:
# Replace the spaces in every column label with underscores.
df.columns = [label.replace(' ', '_') for label in df.columns]

In [54]:
# Preview the frame with its current column labels.
df.head()

Unnamed: 0,Identifier,Place_of_Publication,Date_of_Publication,Publisher,Title,Author,Flickr_URL
0,206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
1,216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
2,218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
3,472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
4,480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [55]:
# Check that 'Identifier' contains no duplicate values, i.e. that it can
# serve as a unique key for the rows.
df['Identifier'].is_unique

True

In [56]:
# Make the Identifier column the row index, then preview the result.
df = df.set_index(keys='Identifier')
df.head(n=5)

Unnamed: 0_level_0,Place_of_Publication,Date_of_Publication,Publisher,Title,Author,Flickr_URL
Identifier,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
206,London,1879 [1878],S. Tinsley & Co.,Walter Forbes. [A novel.] By A. A,A. A.,http://www.flickr.com/photos/britishlibrary/ta...
216,London; Virtue & Yorston,1868,Virtue & Co.,All for Greed. [A novel. The dedication signed...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
218,London,1869,"Bradbury, Evans & Co.",Love the Avenger. By the author of “All for Gr...,"A., A. A.",http://www.flickr.com/photos/britishlibrary/ta...
472,London,1851,James Darling,"Welsh Sketches, chiefly ecclesiastical, to the...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...
480,London,1857,Wertheim & Macintosh,"[The World in which I live, and my place in it...","A., E. S.",http://www.flickr.com/photos/britishlibrary/ta...


In [57]:
df.loc[216] # label-based lookup: the row whose index value (Identifier) is 216

Place_of_Publication                             London; Virtue & Yorston
Date_of_Publication                                                  1868
Publisher                                                    Virtue & Co.
Title                   All for Greed. [A novel. The dedication signed...
Author                                                          A., A. A.
Flickr_URL              http://www.flickr.com/photos/britishlibrary/ta...
Name: 216, dtype: object

In [58]:
df.loc[216, ['Place_of_Publication','Date_of_Publication']] # label-based lookup: index value 216, restricted to the listed column names

Place_of_Publication    London; Virtue & Yorston
Date_of_Publication                         1868
Name: 216, dtype: object

In [59]:
df.iloc[1, 1:3] # position-based lookup: second row, columns at positions 1 and 2 (the slice end is exclusive)

Date_of_Publication            1868
Publisher              Virtue & Co.
Name: 216, dtype: object

In [62]:
# Collect the distinct values of the Place_of_Publication column.
place = df['Place_of_Publication'].unique()

In [65]:
# Show the array of distinct Place_of_Publication values.
place

array(['London', 'London; Virtue & Yorston',
       'pp. 40. G. Bryan & Co: Oxford, 1898', ...,
       'pp. viii. 64. J. Debrett: London, 1789', 'G. Eld: London, 1608',
       'Newcastle upon Tyne'], dtype=object)

In [67]:
# Build a list of (state line, town line) pairs from the raw text file.
# NOTE(review): `state` is only assigned once a line containing '[edit]' has
# been read; if the file's first line is not such a header, the append below
# raises NameError.
university_towns = []
with open('university_towns.txt') as file:
    for line in file:
        if '[edit]' not in line:
            # A town line: pair it with the most recently seen state header.
            university_towns.append((state, line))
        else:
            # A state header: remember it until the next header appears.
            state = line

university_towns[:5]

[('Alabama[edit]\n', 'Auburn (Auburn University)[1]\n'),
 ('Alabama[edit]\n', 'Florence (University of North Alabama)\n'),
 ('Alabama[edit]\n', 'Jacksonville (Jacksonville State University)[2]\n'),
 ('Alabama[edit]\n', 'Livingston (University of West Alabama)[2]\n'),
 ('Alabama[edit]\n', 'Montevallo (University of Montevallo)[2]\n')]

In [68]:
# Turn the (state, town) tuples into a two-column DataFrame.
towns_df = pd.DataFrame(data=university_towns, columns=['State', 'RegionName'])

In [69]:
# Preview the first rows of towns_df.
towns_df.head()

Unnamed: 0,State,RegionName
0,Alabama[edit]\n,Auburn (Auburn University)[1]\n
1,Alabama[edit]\n,Florence (University of North Alabama)\n
2,Alabama[edit]\n,Jacksonville (Jacksonville State University)[2]\n
3,Alabama[edit]\n,Livingston (University of West Alabama)[2]\n
4,Alabama[edit]\n,Montevallo (University of Montevallo)[2]\n


In [70]:
def get_citystate(item):
    """Return the bare state or town name contained in one raw line of text.

    The text is cut at the first ' (' if one is present, otherwise at the
    first '['. When neither marker occurs, the text is returned with its
    trailing whitespace removed, so a line such as 'Some Town\\n' does not
    keep the newline left over from reading the file line by line.

    Parameters
    ----------
    item : str
        Raw text, e.g. 'Auburn (Auburn University)[1]\\n' or 'Alabama[edit]\\n'.

    Returns
    -------
    str
        The text before the marker, or the right-stripped text when no marker
        is found.
    """
    # ' (' is tried before '[' so that it keeps priority when both occur.
    for marker in (' (', '['):
        cut = item.find(marker)
        if cut != -1:
            return item[:cut]
    # No marker at all: only the trailing newline/whitespace needs removing.
    return item.rstrip()

In [71]:
# Clean every cell of both columns with get_citystate.
# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map);
# mapping each column Series gives the same element-wise result and runs
# without a FutureWarning on old and new pandas versions alike.
towns_df = towns_df.apply(lambda column: column.map(get_citystate))

In [72]:
# Preview towns_df again after get_citystate has been applied to its cells.
towns_df.head()

Unnamed: 0,State,RegionName
0,Alabama,Auburn
1,Alabama,Florence
2,Alabama,Jacksonville
3,Alabama,Livingston
4,Alabama,Montevallo


**Label Encoding & One Hot Encoding**

In [84]:
# Load the fruit table used for the encoding examples and display it.
dff = pd.read_csv(filepath_or_buffer='fruits.csv')
dff

Unnamed: 0,Fruit,Quantity
0,Apple,25
1,Banana,50
2,Orange,30
3,Guava,20
4,Pineapple,10


In [87]:
# Label encoding: replace each value in the 'Fruit' column with an integer code.
le = LabelEncoder()
fruit_codes = le.fit_transform(dff['Fruit'])
dff['Fruit'] = fruit_codes
dff

Unnamed: 0,Fruit,Quantity
0,0,25
1,1,50
2,3,30
3,2,20
4,4,10


In [89]:
# One-hot encoding
# Expand column 0 ('Fruit') into one indicator column per category; the
# remaining column(s) are passed through unchanged after the indicators.
columnTransformer = ColumnTransformer(
    [('encoder', OneHotEncoder(), [0])],
    remainder='passthrough',
    # Always return a dense array. OneHotEncoder emits a sparse matrix, and
    # with the default threshold a table with more categories would come back
    # sparse, which np.array(...) below cannot convert element by element.
    sparse_threshold=0,
)

# `np.str` was only an alias for the builtin `str`; it was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError. The builtin
# produces the same '<U32' string array.
dff_data = np.array(columnTransformer.fit_transform(dff), dtype=str)
dff_data

array([['1.0', '0.0', '0.0', '0.0', '0.0', '25.0'],
       ['0.0', '1.0', '0.0', '0.0', '0.0', '50.0'],
       ['0.0', '0.0', '0.0', '1.0', '0.0', '30.0'],
       ['0.0', '0.0', '1.0', '0.0', '0.0', '20.0'],
       ['0.0', '0.0', '0.0', '0.0', '1.0', '10.0']], dtype='<U32')