In [1]:
import pandas as pd
import csv

#### Examples from the slides of the lecture

In [2]:
# Build a dataframe from a dict that maps each column name to its list of values.
data = dict(
    State=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    Year=[2000, 2001, 2002, 2001, 2002, 2003],
    Population=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)
df = pd.DataFrame(data=data)
df

Unnamed: 0,State,Year,Population
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [3]:
# Build the same dataframe from a row-oriented matrix: one inner list per row,
# with the column labels supplied separately.
cols = ['State', 'Year', 'Population']
data = [
    ['Ohio', 2000, 1.5],
    ['Ohio', 2001, 1.7],
    ['Ohio', 2002, 3.6],
    ['Nevada', 2001, 2.4],
    ['Nevada', 2002, 2.9],
    ['Nevada', 2003, 3.2],
]
df = pd.DataFrame(data=data, columns=cols)
df

Unnamed: 0,State,Year,Population
0,Ohio,2000,1.5
1,Ohio,2001,1.7
2,Ohio,2002,3.6
3,Nevada,2001,2.4
4,Nevada,2002,2.9
5,Nevada,2003,3.2


In [4]:
# Report the size of the dataframe; df.shape is the pair (number of rows, number of columns).
print("number of columns = ", df.shape[1])
print('number of rows = ', df.shape[0])

number of columns =  3
number of rows =  6


In [5]:
# Load movies.csv into a dataframe.
# The strings 'na', '-', '.' and '' are read as missing values.
df = pd.read_csv(
    'movies.csv',
    sep=',',
    quotechar='"',
    doublequote=True,
    quoting=csv.QUOTE_ALL,
    na_values=['na', '-', '.', ''],
    encoding="ISO-8859-1",
)
df

FileNotFoundError: [Errno 2] No such file or directory: 'movies.csv'

In [None]:
# Overview of the index and the data type of every column.
df.info()

# Summary statistics of the columns.
dfs = df.describe()

# Peek at both ends of the table, n rows each.
n = 4
dfh = df.head(n)
dft = df.tail(n)

# First five rows restricted to the first five columns (positional slicing).
top_left_corner_df = df.iloc[0:5, 0:5]

In [None]:
# Positional selection: every row, columns at positions 5 up to (not including) 10.
col_set = df.iloc[:, 5:10]
col_set

In [None]:
# For every column, print its name and how many distinct values it holds
# (a missing value counts as one distinct value, exactly as Series.unique() reports it).
for col in df.columns:
    distinct_values = df[col].unique()
    print(col, ' has (', len(distinct_values), ') unique values')

In [None]:
# Print the data type of every column.
# df.dtypes is a Series indexed by column label, so it is walked as (label, dtype)
# pairs. Indexing it with an integer (dataTypeSeries[0]) relies on a positional
# fallback that pandas has deprecated for label-indexed Series.
dataTypeSeries = df.dtypes
for col_name, col_dtype in dataTypeSeries.items():
    print(col_name, 'has type (', col_dtype, ')')

In [None]:
# Profiling the table: print the min, max and average of the attributes with numerical values.
# Columns are selected with is_numeric_dtype rather than "dtype is not object", because
# non-object dtypes such as string, category or datetime are not numeric and would make
# the statistics below fail. The dtypes Series is walked as (label, dtype) pairs to avoid
# integer indexing on a label-indexed Series, which pandas has deprecated.
dataTypeSeries = df.dtypes
for col_name, col_dtype in dataTypeSeries.items():
    if pd.api.types.is_numeric_dtype(col_dtype):
        column = df[col_name]
        print(col_name, 'has Min = ', column.min(),
              'Max = ', column.max(),
              'Average = ', column.mean())


In [None]:
# Select the 'director_name' column by its label.
df["director_name"]

In [None]:
# Label-based selection: rows whose content rating is 'PG-13', limited to three named columns.
pg13_mask = df['content_rating'].eq('PG-13').to_numpy()
df.loc[pg13_mask, ['actor_1_facebook_likes', 'actor_3_facebook_likes', 'budget']]

In [None]:
# Position-based selection: rows whose content rating is 'PG-13', limited to the
# columns at positions 5, 7 and 9. iloc needs the mask as a plain boolean array.
pg13_mask = df['content_rating'].eq('PG-13').to_numpy()
df.iloc[pg13_mask, [5, 7, 9]]

In [None]:
# Boolean Series: True for the rows whose content rating is 'PG-13'.
idx = df['content_rating'].eq('PG-13')

In [None]:
# Boolean Series flagging the rows in which 'color' is missing.
s = df['color'].isna()

# Number of non-missing entries in every column.
df.count()

In [None]:
# Positional selection: every row, only the columns at positions 5, 7 and 9.
df.iloc[:, [5,7,9]]

In [None]:
# Look up the labels of the columns at positions 5, 7 and 9, then select those
# columns by label and keep all rows.
picked_labels = df.columns[[5, 7, 9]]
cols_set2 = df[picked_labels][:]
cols_set2

#### Connecting to a DBMS and writing a query that returns the names of the tables in the database dvdrental

In [None]:
import os

import psycopg2

# Open a connection to the local dvdrental database and create a cursor for queries.
# The user name and password are taken from the PGUSER / PGPASSWORD environment
# variables when they are set, so real credentials never have to be written into the
# notebook; the fallbacks are the values this notebook used before.
conn = psycopg2.connect(
    database='dvdrental',
    user=os.environ.get('PGUSER', 'postgres'),
    password=os.environ.get('PGPASSWORD', 'postgres'),
    host='127.0.0.1',
    port=5432,
)
cursor = conn.cursor()


In [None]:
# Query the information schema for the names of the base tables in the 'public' schema.
# Adjacent string literals are joined by Python into a single SQL statement.
sql_str = (
    "SELECT TABLE_NAME FROM INFORMATION_SCHEMA.TABLES WHERE"
    " TABLE_TYPE = 'BASE TABLE'"
    " AND TABLE_SCHEMA = 'public' "
)
cursor.execute(sql_str)

# Fetch every row returned by the query.
table_names = cursor.fetchall()
table_names

#### Merge (equivalent to a join in SQL)

In [None]:
# Two small frames; the id values '1' and '2' occur in both of them.
dd1 = pd.DataFrame({
    'id_dd1': ['1', '2', '3', '4', '5'],
    'Feature1': ['A', 'C', 'E', 'G', 'I'],
    'Feature2': ['B', 'D', 'F', 'H', 'J'],
})
dd2 = pd.DataFrame({
    'id_dd2': ['1', '2', '6', '7', '8'],
    'Feature1': ['A', 'C', 'O', 'Q', 'S'],
    'Feature2': ['B', 'D', 'P', 'R', 'T'],
})

# axis=1 puts the two frames next to each other, aligned on the row index;
# the id columns are not compared here.
union_df = pd.concat(objs=[dd1, dd2], axis=1)
union_df

In [None]:
# Join dd1 with dd2 by matching dd1's 'id_dd1' against dd2's 'id_dd2'.
# how='inner' (the default of merge) keeps only the rows that find a match.
df_merge_col = dd1.merge(dd2, how='inner', left_on='id_dd1', right_on='id_dd2')
df_merge_col

In [None]:
# General form of a merge on differently named key columns:
#     pd.merge(left, right, left_on=<key column of left>, right_on=<key column of right>)
# The earlier version of this cell could not run: the call was never closed and
# left_on was passed twice. 'att_dd1' / 'att_dd2' are not columns of dd1 / dd2
# (presumably placeholders), so the actual key columns are used here.
df_merge_col = pd.merge(dd1, dd2, left_on='id_dd1', right_on='id_dd2')

### Exercises


In [None]:
# Load world.csv into the dataframe df_w.
# The strings 'na', '-', '.' and '' are read as missing values.
df_w = pd.read_csv(
    'world.csv',
    sep=',',
    quotechar='"',
    doublequote=True,
    quoting=csv.QUOTE_ALL,
    na_values=['na', '-', '.', ''],
    encoding="ISO-8859-1",
)
df_w

In [None]:
# Load bbc.csv into the dataframe df_bbc.
# The strings 'na', '-', '.' and '' are read as missing values.
df_bbc = pd.read_csv(
    'bbc.csv',
    sep=',',
    quotechar='"',
    doublequote=True,
    quoting=csv.QUOTE_ALL,
    na_values=['na', '-', '.', ''],
    encoding="ISO-8859-1",
)
df_bbc

In [None]:
# Find the total population of the European countries according to the world table.
# Filter the rows with a boolean mask, then add the values up with Python's built-in sum.
in_europe = df_w['continent'] == 'Europe'
sum(df_w[in_europe]['population'])

In [None]:
# Another solution: the same filter, but summed with the Series' own sum() method.
european_rows = df_w[df_w['continent'].eq('Europe')]
european_rows['population'].sum()

In [None]:
# Load game.csv into the dataframe df_game.
# The strings 'na', '-', '.' and '' are read as missing values.
df_game = pd.read_csv(
    'game.csv',
    sep=',',
    quotechar='"',
    doublequote=True,
    quoting=csv.QUOTE_ALL,
    na_values=['na', '-', '.', ''],
    encoding="ISO-8859-1",
)
df_game

In [None]:
# Return the information about the games in which the Dutch team participated:
# a row qualifies when "NED" appears as team1 or as team2.
ned_is_team1 = df_game['team1'].eq("NED")
ned_is_team2 = df_game['team2'].eq("NED")
df_game[ned_is_team1 | ned_is_team2]