## Importing CSV file into MySQL via Python - AirBnB Rome Review

### Importing libraries and loading dataset

Libaries needed for this project includes MySQLdb for connecting to MySQL, csv and sys for retriving files from the computer and pandas for manipulating retrieved csv files.

In [2]:
#import libraries
import MySQLdb
import csv
import sys
import pandas as pd

In [3]:
#loading dataset
rome_review = pd.read_csv("C:\\Users\\ofagb\\Machine Learning\\Rome Review.csv", encoding = 'latin')
rome_review.head()

Unnamed: 0,id,host_id,host_name,acceptance_rate,no_of_reviews_30days,last_review,service_score,cleaniness_score,checkin_score,location_score,average_score
0,17629440,68534156,Cristina,0.96,8,7/12/2022,4.74,4.92,4.85,4.65,4.79
1,2737,3047,Elif,0.03,0,5/28/2015,4.8,4.6,4.8,4.4,4.65
2,103870,535822,Fabrizio,1.0,30,8/25/2022,4.11,4.22,4.65,4.4,4.35
3,2903,3280,Andrea,0.03,0,10/31/2017,4.28,4.67,4.59,4.74,4.57
4,104339,543054,Giuseppe,0.0,0,12/2/2017,5.0,5.0,4.83,4.83,4.92


In [46]:
#checking if columns are of appropriate data types
rome_review.dtypes

id                        int64
host_id                   int64
host_name                object
acceptance_rate         float64
no_of_reviews_30days      int64
last_review              object
service_score           float64
cleaniness_score        float64
checkin_score           float64
location_score          float64
average_score           float64
dtype: object

In [47]:
#check if null value exists by asking a True of False question
print('Any missing values?', rome_review.isnull(). values.any())

Any missing values? True


In [49]:
#checking for null values
rome_review.isnull().sum()

id                         0
host_id                    0
host_name                  6
acceptance_rate            0
no_of_reviews_30days       0
last_review             2535
service_score              0
cleaniness_score           0
checkin_score              0
location_score             0
average_score              0
dtype: int64

In [50]:
#keeping all columns but droppubg rows with missing values
rome_review = rome_review.dropna(axis=0)

In [51]:
#checking for duplicate values
print('Are there any duplicates?', rome_review.duplicated().any())

Are there any duplicates? False


In [52]:
#checking if there are still null columns and if columns are of correct datatypes
rome_review.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 18167 entries, 0 to 20706
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id                    18167 non-null  int64  
 1   host_id               18167 non-null  int64  
 2   host_name             18167 non-null  object 
 3   acceptance_rate       18167 non-null  float64
 4   no_of_reviews_30days  18167 non-null  int64  
 5   last_review           18167 non-null  object 
 6   service_score         18167 non-null  float64
 7   cleaniness_score      18167 non-null  float64
 8   checkin_score         18167 non-null  float64
 9   location_score        18167 non-null  float64
 10  average_score         18167 non-null  float64
dtypes: float64(6), int64(3), object(2)
memory usage: 1.7+ MB


### Connecting to MySQL

In [None]:
To connect to SQL xxxxxxx

In [53]:
#connect to database
db = MySQLdb.connect(host="localhost", user="root", password="xxxxxx", database="rome_airbnb")

cursor = db.cursor()

#print db to ensure connection is successful
print(db)

<_mysql.connection open to 'localhost' at 000001A7C7D3CB20>


In [56]:
#reading csv data into MySQL database
csv_data = csv.reader(open("Rome Review.csv"))
header = next(csv_data)

print('Importing the CSV Files')
for row in csv_data:
    print(row)
    cursor.execute(
        "INSERT INTO rome_reviews(id,host_id, host_name,acceptance_rate,no_of_reviews_30days, last_review,service_score,\
        cleaniness_score,checkin_score,location_score,average_score)\
        VALUES (%s, %s, %s, %s,%s, %s, %s, %s,%s,%s, %s)",row)

db.commit()
cursor.close()
print('Done')

Importing the CSV Files
['17629440', '68534156', 'Cristina', '0.96', '8', '7/12/2022', '4.74', '4.92', '4.85', '4.65', '4.79']
['2737', '3047', 'Elif', '0.03', '0', '5/28/2015', '4.8', '4.6', '4.8', '4.4', '4.65']
['103870', '535822', 'Fabrizio', '1', '30', '8/25/2022', '4.11', '4.22', '4.65', '4.4', '4.35']
['2903', '3280', 'Andrea', '0.03', '0', '10/31/2017', '4.28', '4.67', '4.59', '4.74', '4.57']
['104339', '543054', 'Giuseppe', '0', '0', '12/2/2017', '5', '5', '4.83', '4.83', '4.92']
['3079', '3504', 'Laura', '0.3', '2', '4/30/2022', '4.53', '4.62', '4.71', '4.81', '4.67']
['11834', '44552', 'Serena', '0.87', '16', '9/4/2022', '4.77', '4.88', '4.97', '4.98', '4.9']
['12398', '11756', 'Gea', '0.83', '1', '8/10/2022', '4.9', '4.9', '4.95', '4.85', '4.9']
['105153', '546346', 'Giulia', '0.96', '0', '5/21/2014', '5', '4.5', '5', '5', '4.88']
['108035', '559233', 'Franco', '0.76', '6', '7/13/2022', '4.48', '4.7', '4.74', '4.31', '4.56']
['108039', '355604', 'Alessandro & Silvia', '1', 