## Pandas

### Series

In [None]:
import numpy as np
import pandas as pd

# Build a labelled Series: four values, each keyed by a string label.
ser1 = pd.Series(
    [120, 380, 250, 360],
    index=['S1', 'S2', 'S3', 'S4'],
)
ser1

In [None]:
# A mapping passed to Series supplies labels (keys) and values in one go.
ser2 = pd.Series(dict(S1=320, S2=180, S3=300, S4=405))
ser2

In [None]:
# Add the scalar 10 to every element of ser1.
# NOTE: this is an in-place update, so each re-run of this cell adds another 10.
ser1 += 10
ser1

In [None]:
# Element-wise sum; operands are aligned on their index labels before adding.
ser1.add(ser2)

In [None]:
# Compare positional access (third element) with label access ('S3').
# Position lookups go through .iloc: on a string-labelled Series, plain ser1[2]
# relies on an integer-as-position fallback that pandas has deprecated.
ser1.iloc[2] == ser1["S3"]

In [None]:
# Positional slice: elements at positions 1 and 2 (the end bound is exclusive).
ser2.iloc[1:3]

In [None]:
# Print a handful of Series properties, one per line, in a fixed order.
for attr in ('dtype', 'hasnans', 'index', 'values', 'is_monotonic_increasing', 'is_unique'):
    print(getattr(ser2, attr))

In [None]:
# Print the basic descriptive statistics of ser2, one per line:
# count, sum, mean, median, max, min, standard deviation, variance.
for stat in ('count', 'sum', 'mean', 'median', 'max', 'min', 'std', 'var'):
    print(getattr(ser2, stat)())

In [None]:
# All summary statistics at once, returned as a Series.
ser2_summary = ser2.describe()
ser2_summary

In [None]:
# A Series of repeated categories; value_counts() tallies how often each occurs.
fruits = ['apple', 'banana', 'apple', 'pitaya', 'apple', 'pitaya', 'durian']
ser3 = pd.Series(fruits)
ser3.value_counts()

In [None]:
# Number of distinct values (missing values are not counted).
ser3.nunique(dropna=True)

In [None]:
# A Series with two missing entries; isnull() flags them with True.
ser4 = pd.Series([10, 20, np.nan, 30, np.nan])
ser4.isnull()

In [None]:
# Keep only the non-missing entries (returns a new Series; ser4 is unchanged).
ser4[ser4.notna()]

In [None]:
# Replace every missing entry with 40 (returns a new Series; ser4 is unchanged).
ser4.fillna(40)

In [None]:
# One number per city, labelled by city name.
ser7 = pd.Series({'London': 20, 'New York': 21, 'Helsinki': 12})
ser7

In [None]:
# Apply a NumPy function to the Series: every value is squared.
ser7_squared = ser7.apply(np.square)
ser7_squared

In [None]:
def subtract(x, value):
    """Return ``x`` reduced by ``value``."""
    return x - value


# Extra positional arguments for the applied function are passed through `args`.
ser7.apply(subtract, args=(5,))

In [None]:
# Fruit -> quantity; displayed sorted by value, smallest first.
ser8 = pd.Series({
    'grape': 35,
    'banana': 96,
    'pitaya': 12,
    'apple': 57,
    'peach': 25,
    'orange': 89,
})
ser8.sort_values(ascending=True)

In [None]:
# Order by label instead of value, in descending (Z -> A) order.
ser8_by_label_desc = ser8.sort_index(ascending=False)
ser8_by_label_desc

In [None]:
# The three entries with the largest values, biggest first.
ser8.nlargest(n=3)

In [None]:
import matplotlib.pyplot as plt

# Quarterly figures drawn as a bar chart, with each value printed above its bar.
ser9 = pd.Series(data=[400, 520, 180, 380], index=['Q1', 'Q2', 'Q3', 'Q4'])

ser9.plot.bar()
plt.ylim(0, 600)        # headroom so the label above the tallest bar fits
plt.xticks(rotation=0)  # keep the quarter labels horizontal

# The loop position is used as the x coordinate; each label sits 5 units above the bar top.
for pos, amount in enumerate(ser9):
    plt.text(pos, amount + 5, amount, ha='center')
plt.show()



In [None]:
# Same data as a pie chart; each wedge is annotated with its share to one decimal place.
ser9.plot.pie(autopct='%.1f%%', pctdistance=0.65)
plt.show()

### DataFrame

In [None]:
# Build a 5-student x 3-course table of random integer scores.
# A seeded generator makes the "random" scores identical on every
# Restart & Run All, so the displayed output is reproducible.
rng = np.random.default_rng(42)
scores = rng.integers(60, 101, size=(5, 3))  # upper bound is exclusive -> scores in 60..100
courses = ['English', 'Math', 'History']
stu_ids = np.arange(1001, 1006)              # student ids 1001..1005 become the row index
df1 = pd.DataFrame(data=scores, columns=courses, index=stu_ids)
df1

In [None]:
# Build a DataFrame from a mapping of column name -> list of values;
# the student ids 1001..1005 are used as the row index.
scores = dict(
    English=[62, 72, 93, 88, 93],
    Math=[95, 65, 86, 66, 87],
    History=[66, 75, 82, 69, 82],
)
stu_ids = np.arange(1001, 1006)
df2 = pd.DataFrame(scores, index=stu_ids)
df2

In [None]:
# Load a CSV file straight into a DataFrame and display it (the result is not kept).
survey_csv = "res/annual-enterprise-survey-2021-financial-year-provisional-csv.csv"
pd.read_csv(survey_csv)

In [None]:
!pip install xlrd # need this lib to open xls files

In [None]:
# Read the worksheet named "Sheet1" from the sample .xls workbook.
people_xls = "res/file_example_XLS_10.xls"
df = pd.read_excel(people_xls, sheet_name="Sheet1")
df

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Drop the column labelled 0 and promote "Id" to the row index.
# The guard makes the cell safe to re-run: once "Id" is the index it is no
# longer a column, and repeating the drop/set_index would raise KeyError.
if "Id" in df.columns:
    df = df.drop(columns=0).set_index("Id")
df

In [None]:
# Select the "Date" column as a Series (bracket form of the attribute access).
df["Date"]

In [None]:
# Column names containing a space can only be selected by string key.
df.loc[:, "First Name"]

In [None]:
# Row at position 1 (the second row), returned as a Series of its column values.
df.iloc[1, :]

In [None]:
# Row whose index label is 2468 (a label lookup, not a position).
df.loc[2468, :]

In [None]:
# Boolean-mask filter: keep the rows whose Age is greater than 30.
older_than_30 = df["Age"] > 30
df[older_than_30]

In [None]:
# Combine two conditions with &: Age below 30 AND index label above 2000.
under_30 = df["Age"] < 30
index_above_2000 = df.index > 2000
df[under_30 & index_above_2000]

In [None]:
# The same kind of row filter written as a query expression over column names.
female_over_30 = "Gender=='Female' and Age>30"
df.query(female_over_30)

In [None]:
# Mean Age per Gender: group first, then pick the column to aggregate.
df.groupby("Gender")[["Age"]].mean()

In [None]:
# Non-null count of every remaining column for each (Country, Gender) pair.
by_country_gender = df.groupby(by=["Country", "Gender"])
by_country_gender.count()

In [None]:
# Pivot: one row per Country holding the sum of Age for that country.
temp = df.pivot_table(index='Country', values=['Age'], aggfunc='sum')
temp

In [None]:
# Bar chart of the per-country totals, with horizontal country labels.
temp.plot.bar(figsize=(8, 4))
plt.xticks(rotation=0)
plt.show()

In [None]:
# Ring (donut) chart of each country's share of the summed ages, largest first.
ring_style = dict(linewidth=1, width=0.35)  # wedge width 0.35 leaves the centre hollow
age_desc = temp.sort_values(by='Age', ascending=False)
age_desc.plot.pie(
    y='Age',
    ylabel='',
    legend=False,
    figsize=(6, 6),
    autopct='%.2f%%',
    pctdistance=0.8,
    wedgeprops=ring_style,
)
plt.show()

In [None]:
!pip install openpyxl # need this for xlsx file

In [None]:
# Load the 'BIDU' worksheet of the stock workbook.
stock_xlsx = 'res/stock.xlsx'
baidu_df = pd.read_excel(stock_xlsx, sheet_name='BIDU')
baidu_df

In [None]:
# Use "Date" as the row index and order the rows by it.
# The guard makes the cell safe to re-run: once "Date" is the index it is no
# longer a column, and a second set_index("Date") would raise KeyError.
if "Date" in baidu_df.columns:
    baidu_df = baidu_df.set_index("Date")
baidu_df = baidu_df.sort_index()
baidu_df


In [None]:
# 5-row rolling mean of every column; the first 4 rows are NaN (window not yet full).
baidu_df.rolling(window=5).mean()

In [None]:
# Plot the closing price together with its 10-row moving average.
close_ma10 = baidu_df.Close.rolling(10).mean()
# Both Series share baidu_df's index, so they can be placed side by side directly.
# This avoids merging a Series with itself and then renaming the automatically
# generated 'Close_x' / 'Close_y' columns.
result_df = pd.DataFrame({'Close': baidu_df.Close, 'MA10': close_ma10})
result_df.plot(kind='line', figsize=(10, 6))
plt.show()