### Uploading DataFrame to SQL DB in AWS
We already scraped the Car Data, now we are going to clean it and make some adjustments to upload it to a Cloud server SQL

Steps:
* import DF from scrapper
* clean data
* Verify columns names
* Upload to DB

In [1]:
import pandas as pd
import math
import psycopg2 as ps
import numpy as np
from dotenv import load_dotenv
import os

In [6]:
df = pd.read_csv('scrap_final.csv')


In [7]:
df = df.rename(columns={df.columns[9]: 'Tipo de carr'})
df.groupby('Marca').size()

Marca
Acura                        1
Alfa Romeo                 226
Aro                          1
Audi                      1999
Audi A1                      2
                          ... 
Volvo                      164
bmw audi mercedes benz       3
fiat 600 82                  2
izuzu                        1
mercedes                     1
Length: 94, dtype: int64

We see that there is a problem with the column names which endes up in NaN values were in reallity there are 3 columns that are the same (This is because of character encoding)
This is why we rename all columns into english name to prevent compatibility issues

In [8]:
new_df = pd.DataFrame()

In [15]:
new_df['id'] = df.index
new_df['brand'] = df['Marca']
new_df['model'] = df['Modelo']
new_df['colour'] = df['Color']
new_df['fuel'] = df['Tipo de combustible']
new_df['doors'] = df['Puertas']
new_df['engine'] = df['Motor']
new_df['location'] = df['Location']
new_df['price'] = df['Price']
new_df['year'] = df['Año'].fillna(df['AÃ±o'])
if 'TransmisiÃ³n' in df:
    new_df['transmision'] = df['Transmisión'].fillna(df['TransmisiÃ³n'])
else:
    new_df['transmision'] = df['Transmisión']
new_df['km'] = df['Kilómetros'].fillna(df['KilÃ³metros'])
new_df['type'] = df['Tipo de carrocería'].fillna(df['Tipo de carr'])
new_df['url'] = df['Link']

In [16]:
new_df.isnull().any()

id             False
brand          False
model          False
colour          True
fuel           False
doors          False
engine          True
location       False
price          False
year           False
transmision     True
km             False
type            True
url            False
dtype: bool

As we see there are some missing values, but at least every car has the main features

We also check for duplicates in the URL

In [17]:
new_df.duplicated(subset="id").any()

False

In [18]:
new_df = new_df.drop_duplicates(subset="id")

### As we see we also have a problem with encoding, 2 values ended up beeing 4
#### Sure there is a library to handle encoding, I triend .encode and .decode but didnt work. So I decided to replace it myself

In [19]:
new_df["transmision"].unique()

array([nan, 'Automática', 'Manual', 'Automática secuencial'], dtype=object)

In [20]:
new_df["transmision"] = new_df["transmision"].str.replace("Ã¡", "á",regex=False)
new_df["fuel"] = new_df["fuel"].str.replace("Ã©", "é",regex=False)
new_df["fuel"] = new_df["fuel"].str.replace("\\", "í",regex=False)


 Also I found this, to normalize and get rid of accents

In [21]:
# From https://stackoverflow.com/a/518232
import unicodedata
def strip_accents(s):
   return ''.join(c for c in unicodedata.normalize('NFD', s)
                  if unicodedata.category(c) != 'Mn')

In [22]:
def series_stripper(serie):
    return serie.apply(lambda x:strip_accents(x))

In [23]:
new_df['fuel'] = series_stripper(new_df['fuel'])
new_df['location'] = series_stripper(new_df['location'])
new_df['transmision'] = series_stripper(new_df['transmision'].fillna(''))
new_df['colour'] = series_stripper(new_df['colour'].fillna('')).str.replace("A³","o",regex=False)
new_df['type'] = series_stripper(new_df['type'].fillna(''))
new_df["brand"] = new_df["brand"].str.replace("Ã«","e",regex=False).str.replace("ë","e",regex=False).str.replace("ÃƒÂ«","e",regex=False)
new_df["fuel"] = new_df["fuel"].str.replace("A\xad","i",regex=False)
new_df["km"] = new_df["km"].str.replace(" km","",regex=False).astype(np.int64)
new_df["type"] = new_df["type"].str.replace("A¡","a",regex=False).str.replace("A³","o",regex=False).str.replace("A©","e",regex=False)
new_df["location"] = new_df["location"].str.replace("A¡","a",regex=False).str.replace("A³","o",regex=False).str.replace("A©","e",regex=False).str.replace("A\xad","i",regex=False).str.replace("A±","n",regex=False).str.replace("Aº","u",regex=False).str.replace("A¼","u",regex=False)
new_df["year"] = new_df["year"].astype(np.int64)
new_df[new_df["km"]>1000000]=1000000 #This prevents SQL to get int out of range
#Index(['brand', 'model', 'colour', 'fuel', 'doors', 'engine', 'location',
 #      'price', 'year', 'transmision', 'km', 'type', 'url'],
  #    dtype='object')

In [24]:
unique = []
unique = new_df.apply(lambda x : x.unique())
unique["brand"]

array(['Alfa Romeo', 'Audi', 'Volkswagen', 'Ford', 'RAM', 'BMW',
       'Audi Q5', 'Citroen', 'Audi A1', 'Renault', 'Audi A3',
       'bmw audi mercedes benz', 'Mercedes-Benz', 'Kia', 'VW', 'Mini',
       'Acura', 'BMW 640I COUPE', 'Chery', 1000000, 'Chevrolet',
       'Peugeot', 'Jeep', 'Chrysler', 'Dodge', 'Crysler', 'DS',
       'Dodge Fargo', 'Dacia', 'Fiat', 'FORD V8 4X4', 'Fordd', 'Honda',
       'Isuzu', 'Hyundai', 'Jeep ika', 'Toyota', 'Ika', 'Mahindra',
       'Daihatsu', 'Land Rover', 'Land rover defender', 'Mazda',
       'Mercedes Benz', 'MERCEDEZ BENZ', 'Mini Cooper', 'Lifan', 'Suzuki',
       'Pontiac TransSport', 'Mitsubishi', 'Nissan', 'Peugeot  208 like',
       'Porsche', 'Smart', 'Subaru', 'Suzuki LJ80', 'Suzuki 1995',
       'Volvo', 'Chevrolet Classic', 'CRYSLER TONW & COUNTRY',
       'Chrysler stratus lx', 'Citroen c4 feel pack', 'CitroÂ´n',
       'Citroen C4 CACTUS', 'Dogde', 'Honda legend', 'Hyunday', 'JMC',
       'izuzu', 'Jeep ika continental', 'IKA', 'Ika 

### Now we got a much cleaner DataFrame

In [25]:
new_df.columns

Index(['id', 'brand', 'model', 'colour', 'fuel', 'doors', 'engine', 'location',
       'price', 'year', 'transmision', 'km', 'type', 'url'],
      dtype='object')

##### We check the types of our DF and translate to SQL types

In [26]:
new_df.dtypes

id              int64
brand          object
model          object
colour         object
fuel           object
doors           int64
engine         object
location       object
price           int64
year            int64
transmision    object
km              int64
type           object
url            object
dtype: object

In [27]:
replacements = {
    'object':'varchar',
    'int64':'int',
    'float64':'float'
}

In [28]:
col_str = ", ".join("{} {}".format(n, d) for (n, d) in zip(new_df.columns[1:], new_df.dtypes[1:].replace(replacements)))
"id int PRIMARY KEY, " + col_str

'id int PRIMARY KEY, brand varchar, model varchar, colour varchar, fuel varchar, doors int, engine varchar, location varchar, price int, year int, transmision varchar, km int, type varchar, url varchar'

##### DB connection

In [27]:
load_dotenv()
host_name = os.environ.get('SQL_HOST_NAME')
dbname = "postgres"
port = "5432"
username = os.environ.get('SQL_USER_NAME')
password =  os.environ.get('SQL_PASSWORD')
conn = None

In [28]:
try:
    conn = ps.connect(host=host_name, database=dbname, user=username, password=password, port=port)
except ps.OperationalError as e:
    raise e
else:
    print("Connected!")

Connected!


In [106]:
create_table = (f"""CREATE TABLE IF NOT EXISTS cars ({col_str})""")

In [80]:
conn.autocommit = True
curr = conn.cursor()

In [81]:
curr.execute(create_table)

In [31]:
new_df.to_csv("tosqldb_f.csv", header=new_df.columns, index=False, encoding='utf-8')
# cars_data = cars_data.drop(cars_data[cars_data["id"]>40000].index)
# csv = open("tosqldb.csv", encoding="utf-8")

Save and open to insert into DB

In [84]:
COPY_QUERY = """
COPY cars FROM STDIN WITH CSV HEADER DELIMITER AS ','
"""
curr.copy_expert(sql=COPY_QUERY, file=csv)

In [None]:
curr.execute("SELECT * FROM cars WHERE year=2019")
curr.fetchone()

In [None]:
'AutomÃ¡tica'.encode('latin-1').decode('utf-8')