## SQL Class Testing

In [1]:
from sql import SQL
import pandas as pd
import re

### Data cleaning 

In [2]:
def extract_number(s):
    """Convert the first number found in a string from a percentage to a fraction.

    The first run of digits in ``s`` (optionally followed by a decimal point
    and further digits) is parsed as a float, divided by 100 and rounded to
    four decimal places, e.g. ``"45.5%"`` -> ``0.455``.

    Args:
        s: Text containing a number, such as ``"45.5%"``.

    Returns:
        The parsed number divided by 100, rounded to 4 decimal places.

    Raises:
        ValueError: If ``s`` contains no digits.
    """
    match = re.search(r"\d+\.?\d*", s)
    if match is None:
        # Name the offending value instead of failing with the opaque
        # AttributeError that calling ``.group()`` on ``None`` would raise.
        raise ValueError(f"no number found in {s!r}")
    return round(float(match.group()) / 100, 4)

# Clean locations.csv
# Keep only the name/longitude/latitude columns, discard the first two data
# rows, renumber the index from zero and write the result back to the same path.
# NOTE(review): the input file is overwritten in place, so every re-run of
# this cell removes two more rows from it.
df = (
    pd.read_csv('data/locations.csv')
    .loc[:, ['name', 'longitude', 'latitude']]
    .iloc[2:]
    .reset_index(drop=True)
)
df.to_csv('data/locations.csv', index=False)

# Clean countries.csv
# The cleaned table is written back over the file it was read from, so this
# cell has to cope with two layouts: the raw one (original column headers,
# comma-grouped counts, textual percentages) and the one this cell itself
# produced on an earlier run.
try:
    df = pd.read_csv('data/countries.csv').rename(columns={'Country or Area':'country', 
                                                        'Internet Users':'users', 
                                                        'Population':'population', 
                                                        'Rank':'rank',
                                                        'Percentage':'percentage'})[['country', 'users', 'population', 'rank', 'percentage']]
    df['users'] = df['users'].str.replace(',', '').astype(int)
    df['population'] = df['population'].str.replace(',', '').astype(int)
    df['percentage'] = df['percentage'].apply(extract_number)
except (AttributeError, KeyError):
    # Only the failures that indicate "not the raw layout" are handled here:
    #   * AttributeError - the `.str` accessor is rejected on columns that are
    #     not strings (e.g. counts that are already numeric after a prior run);
    #   * KeyError - one of the expected columns is missing.
    # Anything else (bad values, interrupts, ...) now propagates instead of
    # being hidden by a bare `except:`.
    df = pd.read_csv('data/countries.csv')
    df['percentage'] = df['percentage'].round(4)

# Single write shared by both paths, kept outside the `try` so an I/O failure
# while saving is not mistaken for a layout problem.
df.to_csv('data/countries.csv', index=False)

### Testing

In [3]:
# add_table
# Register both cleaned CSV files with a fresh SQL instance; each table is
# named after its file and loaded with ',' as the third argument.
database = SQL()
for table_name in ('countries', 'locations'):
    database.add_table(f'data/{table_name}.csv', table_name, ',')

In [None]:
# Test: sql_from
# Each call returns a pair: the table and a name.
table1, mod_name1 = database.sql_from('FROM countries')
table2, mod_name2 = database.sql_from('FROM locations')

# For each result, show the first three values of every column followed by
# the returned name; a blank line separates the two previews.
previews = ((table1, mod_name1), (table2, mod_name2))
for position, (table, returned_name) in enumerate(previews):
    if position:
        print()
    for col in table.keys():
        print(col, table[col][:3])
    print(returned_name)

In [None]:
# Test: sql_join
# Three spellings of the same join clause: no alias, bare aliases on both
# tables, and an `AS` alias on the joined table only.
table3 = database.sql_join('JOIN locations ON countries.country = locations.name', 'countries', mod_name1)
table4 = database.sql_join('JOIN locations l ON c.country = l.name', 'countries', 'c')
table5 = database.sql_join('JOIN locations AS l ON countries.country = l.name', 'countries', mod_name1)

# Show the first three values of every column of each result, with a blank
# line between consecutive results.
for position, joined in enumerate((table3, table4, table5)):
    if position:
        print()
    for col in joined.keys():
        print(col, joined[col][:3])