# Python

---

# List
### Indexing

In [2]:
a_list = ["Python", "Data", "Science"]

a_list[0]

'Python'

### List slicing

In [3]:
a_list[:2]

['Python', 'Data']

---

# Tuple
### Indexing

In [4]:
a_tuple = ("Python", "Data", "Science")

a_tuple[0]

'Python'

### slicing

In [5]:
a_tuple[:2]

('Python', 'Data')

## Unpacking

In [6]:
item_0, item_1, item_2 = ("Python", "Data", "Science")

item_0

'Python'

In [7]:
item_0, (item_1, item_2) = "Python", ("Data", "Science")

item_1

'Data'

In [8]:
fig, (plot1, plot2) = "Figure", ("Plot1", "Plot2")

plot1

'Plot1'

In [9]:
fig, ((plot1, plot2),(plot3, plot4)) = "Figure", (("Plot1", "Plot2"), ("Plot3", "Plot4"))

plot2

'Plot2'

---

## Dictionaries

#### Create a dictionary

In [10]:
d = {"color": "mediumblue", "linestyle": "dashed"}

#### Access a dictionary

In [11]:
d["color"]

'mediumblue'

---

## for Loops

In [21]:
for item in ["item 1", "item 2", "item 3"]:
    print(item)

item 1
item 2
item 3


---

## zip

In [23]:
# zip combines the respective items from each list as a tuple

list(zip(["Data", "Machine", "Artificial"], ["Science", "Learning", "Intelligence"]))

[('Data', 'Science'), ('Machine', 'Learning'), ('Artificial', 'Intelligence')]

### Unpacking zipped items

In [24]:
# Unpacking the zipped tuples as we loop through

for item1, item2 in zip(["Data", "Machine", "Artificial"], ["Science", "Learning", "Intelligence"]):
    print(item1, item2)

Data Science
Machine Learning
Artificial Intelligence


#### What the machine sees:

In [25]:
                    # after being zipped into a list of tuples...
for item1, item2 in [('Data', 'Science'), ('Machine', 'Learning'), ('Artificial', 'Intelligence')]:
    print(item1, item2)

Data Science
Machine Learning
Artificial Intelligence


### Example

In [26]:
# Unpacking the created list of tuples as we loop through each zipped item

for box, color in zip(["box1", "box2", "box3"], ["lightblue", "mediumblue", "darkblue"]):
    print(box, color)

box1 lightblue
box2 mediumblue
box3 darkblue


#### What the machine sees:

In [27]:
for box, color in [('box1', 'light blue'), ('box2', 'medium blue'), ('box3', 'darkblue')]:
    print(box, color)

box1 light blue
box2 medium blue
box3 darkblue


---

## Function

#### Define the function

In [36]:
def my_plot():
    return "my data viz..."

#### Call the function

In [38]:
# No parameters

my_plot()

'my data viz...'

### Using parameters

In [43]:
def my_plot(plot_type, color):
    return f"My {color} {plot_type}"

#### Call the function

In [44]:
# Providing two required values

my_plot("histogram", "skyblue")

'My skyblue histogram'

### Using default values

In [50]:
def my_plot(plot_type, color="mediumblue"):
    return f"My {color} {plot_type}"

In [46]:
# Using the default value for color

my_plot("histogram")

'My mediumblue histogram'

### Providing an optional color value

In [52]:
my_plot("histogram", "lightblue")

'My lightblue histogram'

---

---

# Pandas

In [91]:
from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/')
for item in os.listdir():
  print(item)
print("--------------")

os.chdir('/content/drive/MyDrive/cloud/GitHub/AdvDataViz/Notebooks/')
for item in os.listdir():
  print(item)

print("--------------")

notebooks = "/content/drive/MyDrive/cloud/GitHub/AdvDataViz/Notebooks"
print(os.listdir(notebooks))
print("--------------")

file = "heart-disease.csv"
file_path = os.path.join(notebooks, file)
with open(file_path, "r") as f:
  contents = f.read()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
learningStore
healthyCar
startup
Artificial Intelligence
cloud
--------------
heart-disease.csv
student_performance.csv
churn.csv
employee_attrition_.csv
Top 50 US Tech Companies.csv
01 Python_Pandas.ipynb
02 Matplotlib.ipynb
03 Matplotlib  - Exercise.ipynb
03 Matplotlib - Exercise Solutions.ipynb
04 Continuous Variables - Histogram .ipynb
05 Continuous Variables - Histogram - Exercise Solutions.ipynb
05 Continuous Variables - Histogram - Exercise .ipynb
06 Continuous Variables - Boxplot.ipynb
07 Continuous Variables - Boxplot - Exercise .ipynb
07 Continuous Variables - Boxplot - Exercise Solutions.ipynb
08 Continuous Variables - Scatterplot.ipynb
09 Continuous Variables - Scatterplot - Exercise .ipynb
09 Continuous Variables - Scatterplot - Exercise Solutions.ipynb
11 Pandas Data Visualization.ipynb
12 Seaborn.ipynb
13 Seaborn - Exercise .ipynb
13 Seaborn - 

In [60]:
import pandas as pd

---

# Data

## DataFrame object

In [92]:
#df = pd.read_csv("heart-disease.csv")

df = pd.read_csv(file_path)
print(df.head())


   age     sex  chest_pain  rest_bp  chol  max_hr  st_depr  heart_disease
0   63  female           3      145   233     150      2.3              1
1   37  female           2      130   250     187      3.5              1
2   41    male           1      130   204     172      1.4              1
3   56  female           1      120   236     178      0.8              1
4   57    male           0      120   354     163      0.6              1


### Preview dataset

In [93]:
# show first 5 rows

df.head()

Unnamed: 0,age,sex,chest_pain,rest_bp,chol,max_hr,st_depr,heart_disease
0,63,female,3,145,233,150,2.3,1
1,37,female,2,130,250,187,3.5,1
2,41,male,1,130,204,172,1.4,1
3,56,female,1,120,236,178,0.8,1
4,57,male,0,120,354,163,0.6,1


### Access a column

In [94]:
# Dictionary notation

df['sex']

Unnamed: 0,sex
0,female
1,female
2,male
3,female
4,male
...,...
298,male
299,female
300,female
301,female


### Unique values

In [95]:
df["sex"].unique()

array(['female', 'male'], dtype=object)

---

## Selection and Filtering
### Column selection

In [96]:
df[['age', 'sex', 'heart_disease']]    # providing a list selects multiple columns

Unnamed: 0,age,sex,heart_disease
0,63,female,1
1,37,female,1
2,41,male,1
3,56,female,1
4,57,male,1
...,...,...,...
298,57,male,0
299,45,female,0
300,68,female,0
301,57,female,0


### Row and Column selection with loc
Allows you to select a subset of the rows and columns using the label/name of the row/column

In [97]:
# loc (selection is inclusive) implies the name/label of the row, column

df.loc[:5, ["age", "sex"]]

Unnamed: 0,age,sex
0,63,female
1,37,female
2,41,male
3,56,female
4,57,male
5,57,female


---

## Boolean row selection

In [98]:
df["sex"]=="female"

Unnamed: 0,sex
0,True
1,True
2,False
3,True
4,False
...,...
298,False
299,True
300,True
301,True


### Using boolean for row selection

In [99]:
# row selection (return the rows that are True),  col selection

df.loc[df["sex"]=="female", ["age", "sex"]]

Unnamed: 0,age,sex
0,63,female
1,37,female
3,56,female
5,57,female
7,44,female
...,...,...
295,63,female
297,59,female
299,45,female
300,68,female


### & (and)

In [100]:
                    # row selection,                        col selection

df.loc[(df["sex"]=="female") & (df["age"] > 65), ["sex", "age", "heart_disease"]]

Unnamed: 0,sex,age,heart_disease
51,female,66,1
86,female,68,1
106,female,69,1
145,female,70,1
150,female,66,1
165,female,67,0
166,female,67,0
197,female,67,0
203,female,68,0
225,female,70,0


## Binning
#### Convert a **continuous or interval** variable to a **categorical** variable.

In [101]:
df["age"].head(10)

Unnamed: 0,age
0,63
1,37
2,41
3,56
4,57
5,57
6,56
7,44
8,52
9,57


In [102]:
                     # bounds: (29, 39], (39, 49], (49, 59],(59, 69],(69, 79]

df["age"] = pd.cut(df["age"], [29, 39, 49, 59, 69, 79], labels=["thirties","forties","fifties","sixties", "seventies"])
df["age"].head(10)

Unnamed: 0,age
0,sixties
1,thirties
2,forties
3,fifties
4,fifties
5,fifties
6,fifties
7,forties
8,fifties
9,fifties


---

## Useful methods

### mean()

In [104]:
df["max_hr"].mean()

149.64686468646866

### median()

In [105]:
df["max_hr"].median()

153.0

### count()
#### Returns the number of rows included in a selection

In [106]:
# Count the number of ages (rows)

df["age"].count()

302

In [107]:
# Count the number of rows where the "age" is "forties"
# Count the number of rows that are "True"

df.loc[df["age"] == "forties", "age"].count()

72

### value_counts()
#### Returns the count of each unique category in a column

In [108]:
# Count the quantity of each unique category for "age"

df["age"].value_counts()

Unnamed: 0_level_0,count
age,Unnamed: 1_level_1
fifties,125
sixties,80
forties,72
thirties,15
seventies,10


---