## Lecture 0 - Pandas Basics

---
- pandas.Series<br>
    One-dimensional array that behaves like a typed dictionary. It can be created from a dictionary or a list.

- pandas.DataFrame
- read_csv
- indexing
- plotting

In [9]:
import pandas as pd
import matplotlib.pyplot as plt

# Label -> value mapping. The "Index"/"Värde" entry is a string stored next to
# the integer counts, so the resulting Series ends up with dtype object.
data = {"Index": "Värde", "AI": 25, "NET": 38, "Java": 30, "UX": 28}

# A Series built from a dict uses the dict keys as its index labels.
programs_series = pd.Series(data)

# print() shows the plain-text representation of the Series
print(programs_series)


Index    Värde
AI          25
NET         38
Java        30
UX          28
dtype: object


In [27]:
# Extract values by integer position. The index of this Series holds string
# labels, and passing an integer to plain [] on such a Series is deprecated
# in pandas, so positional access goes through .iloc explicitly.
print(f"{programs_series.iloc[1]=}")
print(f"{programs_series.iloc[-1]=}")

# Get the keys (the index labels of the Series)
print(f"{programs_series.keys()=}")

programs_series[1]=25
programs_series[-1]=28
programs_series.keys()=Index(['Index', 'AI', 'NET', 'Java', 'UX'], dtype='object')


In [25]:
import random as rnd

rnd.seed(72)  # fixed seed so the same rolls are produced on every run

# Simulate ten rolls of a six-sided die (randint includes both endpoints)
rolls = []
for _ in range(10):
    rolls.append(rnd.randint(1, 6))

dice_series = pd.Series(rolls)

# Show only the first three rolls
print(dice_series.head(3))

0    1
1    5
2    6
dtype: int64


## Statistics methods

These methods build on NumPy functions, so call them directly on the Series instead of looping over the values in Python.

In [28]:
# Smallest value among the rolls
dice_series.min()

1

In [29]:
# Integer position (not the value) of the smallest roll
dice_series.argmin()

0

In [30]:
# Largest value among the rolls
dice_series.max()

6

In [31]:
# Arithmetic mean of the ten rolls
dice_series.mean()

3.9

In [32]:
# Median of the rolls (returned as a float)
dice_series.median()

4.0

---
## df = DataFrame

- tabular data with rows and columns
- analogous to 2D NumPy arrays
  - with flexible row indices and column names
  - more like an Excel sheet or other tabular formats

A DataFrame can be viewed as a specialized dictionary in which each column name maps to a Series object.

Convention: DataFrame variable names are prefixed with `df_`.


In [38]:
# Instantiate a DataFrame from a Series object: the Series' labels become the
# row index and the single column gets an explicit name.
df_programs = pd.DataFrame(data=programs_series, columns=["Number_of_students"])

# five rows, one column
df_programs

Unnamed: 0,Number_of_students
Index,Värde
AI,25
NET,38
Java,30
UX,28


In [40]:
# Create two Series objects that share the same program labels as their index
program_names = ["AI", "NET", "UX", "Java"]
students = pd.Series([26, 38, 28, 30], index=program_names)
skills = pd.Series(["Python", "C#", "Figma", "Java"], index=program_names)

# Combine them into a new DataFrame: each column name is mapped to a
# pd.Series object, and rows are aligned on the shared index.
df_programs = pd.DataFrame({"Students": students, "Skills": skills})
df_programs


Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [41]:
# Median of the "Students" column
median_student_number = df_programs["Students"].median()

# Row labels of the frame as a plain Python list
program_labels = df_programs.index.to_list()

# .0f shows the median without decimals
print(f"Median students in programs {program_labels} is {median_student_number:.0f}")

Median students in the programs ['AI', 'NET', 'UX', 'Java'] is 29


## Indexers

 - loc - slicing and indexing using the explicit index labels
 - iloc - slicing and indexing by integer position, with Python-style slices `[start:stop:step]`

In [42]:
# Selecting a single column by name returns it as a Series
df_programs["Skills"]

AI      Python
NET         C#
UX       Figma
Java      Java
Name: Skills, dtype: object

In [44]:
df_programs.loc["AI"] # Returns a Series holding the row labelled "AI"

Students        26
Skills      Python
Name: AI, dtype: object

In [59]:
df_programs.iloc[0:3] # Returns a DataFrame with the rows at positions 0, 1 and 2

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma


## Masking

In [60]:
# Display the full frame before a boolean mask is applied to it
df_programs

Unnamed: 0,Students,Skills
AI,26,Python
NET,38,C#
UX,28,Figma
Java,30,Java


In [48]:
# Boolean mask: True for programs with at least 30 students
at_least_30 = df_programs["Students"] >= 30

# Filtering builds a new DataFrame; df_programs itself is not mutated
df_programs_over_29 = df_programs.loc[at_least_30]
df_programs_over_29

Unnamed: 0,Students,Skills
NET,38,C#
Java,30,Java


---

## Microsoft Excel data

In [62]:
import seaborn as sns

# Read the Excel workbook into a DataFrame (the path is relative to the
# working directory the notebook runs in)
df = pd.read_excel("../Data/calories.xlsx")

# Preview the first rows
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [64]:
# Summary of columns, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2225 entries, 0 to 2224
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   FoodCategory      2225 non-null   object
 1   FoodItem          2225 non-null   object
 2   per100grams       2225 non-null   object
 3   Cals_per100grams  2225 non-null   object
 4   KJ_per100grams    2225 non-null   object
dtypes: object(5)
memory usage: 87.0+ KB


In [65]:
# (number of rows, number of columns)
df.shape

(2225, 5)

## Data cleaning & exploration

- type convert string objects with numerical values to int
- change column names
- separate into liquids and solids

In [66]:
# Look at the raw values again before cleaning: the calorie and kJ columns
# are strings that carry unit suffixes
df.head()

Unnamed: 0,FoodCategory,FoodItem,per100grams,Cals_per100grams,KJ_per100grams
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ


In [70]:
# Shorten the column names; only the listed columns are renamed
short_column_names = {
    "Cals_per100grams": "Calories",
    "KJ_per100grams": "KJ",
    "per100grams": "per100",
}
df = df.rename(columns=short_column_names)

df.head()

Unnamed: 0,FoodCategory,FoodItem,per100,Calories,KJ
0,CannedFruit,Applesauce,100g,62 cal,260 kJ
1,CannedFruit,Canned Apricots,100g,48 cal,202 kJ
2,CannedFruit,Canned Blackberries,100g,92 cal,386 kJ
3,CannedFruit,Canned Blueberries,100g,88 cal,370 kJ
4,CannedFruit,Canned Cherries,100g,54 cal,227 kJ
