## Python : Pandas 
#### 1.Pandas : Series creation and indexing

In [2]:
import pandas as pd

In [3]:
grades = pd.Series(range(80,100,2))
print(grades)

0    80
1    82
2    84
3    86
4    88
5    90
6    92
7    94
8    96
9    98
dtype: int64


In [4]:
# index : left aligned, elements : right aligned
# key becomes column index, which means that each column is a Series

season_temps = pd.DataFrame({'Spring': [10,14,18], 'Summer':[24,27,30],
                            'Fall':[24,21,18],'Winter':[8,0,-5]})
print(season_temps)

   Spring  Summer  Fall  Winter
0      10      24    24       8
1      14      27    21       0
2      18      30    18      -5


In [5]:
scores = {'Kim':[87,96,70],'Park':[100,87,90],'Sam':[90,77,90],\
         'Kwon':[100,90,95],'Lee':[83,65,85]}
scores_df = pd.DataFrame(scores)
print(scores_df)

   Kim  Park  Sam  Kwon  Lee
0   87   100   90   100   83
1   96    87   77    90   65
2   70    90   90    95   85


In [6]:
#Changing index
scores_ni = pd.DataFrame(scores, index = ['Math','Econ','Physics'])
print(scores_ni)
scores_df.index = ['Math','Econ','Physics']
print(scores_df)

         Kim  Park  Sam  Kwon  Lee
Math      87   100   90   100   83
Econ      96    87   77    90   65
Physics   70    90   90    95   85
         Kim  Park  Sam  Kwon  Lee
Math      87   100   90   100   83
Econ      96    87   77    90   65
Physics   70    90   90    95   85


#### 3. Pandas : DataFrame slicing
- loc : selecting rows with row name
- iloc : selecting rows with index number
- at : getting a specific element of a DataFrame
- iat : getting a specific element of a DataFrame

In [8]:
print(f'{season_temps.loc[0]}\n{season_temps.iloc[0]}')

Spring    10
Summer    24
Fall      24
Winter     8
Name: 0, dtype: int64
Spring    10
Summer    24
Fall      24
Winter     8
Name: 0, dtype: int64


In [9]:
#slicing by setting row index range
print(f'{scores_ni.loc["Math":"Econ"]}\n{scores_df.iloc[:2,:3]}')

      Kim  Park  Sam  Kwon  Lee
Math   87   100   90   100   83
Econ   96    87   77    90   65
      Kim  Park  Sam
Math   87   100   90
Econ   96    87   77


In [12]:
#slicing with specific index indexes
print(f'{scores_ni.loc[["Math","Physics"]]}\n{scores_df.iloc[[0,2],:3]}')

         Kim  Park  Sam  Kwon  Lee
Math      87   100   90   100   83
Physics   70    90   90    95   85
         Kim  Park  Sam
Math      87   100   90
Physics   70    90   90


In [13]:
#slicing with a row index range and column names or indexes
print(f'{scores_ni.loc["Math":"Physics",["Kim","Sam"]]}\n{scores_df.iloc[[0,2],0:3]}')

         Kim  Sam
Math      87   90
Econ      96   77
Physics   70   90
         Kim  Park  Sam
Math      87   100   90
Physics   70    90   90


In [14]:
#selecting specific rows and columns, which are not consecutive
print(f'{scores_ni.loc[["Math","Physics"],["Kim","Sam"]]}')

         Kim  Sam
Math      87   90
Physics   70   90


In [15]:
print(scores_df)
print(scores_df.at['Econ','Kwon'],scores_df.iat[1,3])

         Kim  Park  Sam  Kwon  Lee
Math      87   100   90   100   83
Econ      96    87   77    90   65
Physics   70    90   90    95   85
90 90


#### 4. Pandas : Boolean indexing

In [16]:
scores_df[scores_df >= 90]

Unnamed: 0,Kim,Park,Sam,Kwon,Lee
Math,,100.0,90.0,100,
Econ,96.0,,,90,
Physics,,90.0,90.0,95,


In [17]:
scores_df[(scores_df <90) &(scores_df > 70)]

Unnamed: 0,Kim,Park,Sam,Kwon,Lee
Math,87.0,,,,83.0
Econ,,87.0,77.0,,
Physics,,,,,85.0


#### 5. Pandas : Descriptive statistcs

In [18]:
pd.set_option('precision',3) #소숫점 셋째자리까지
print(scores_df.describe())

          Kim     Park     Sam   Kwon     Lee
count   3.000    3.000   3.000    3.0   3.000
mean   84.333   92.333  85.667   95.0  77.667
std    13.204    6.807   7.506    5.0  11.015
min    70.000   87.000  77.000   90.0  65.000
25%    78.500   88.500  83.500   92.5  74.000
50%    87.000   90.000  90.000   95.0  83.000
75%    91.500   95.000  90.000   97.5  84.000
max    96.000  100.000  90.000  100.0  85.000


In [19]:
scores_df.mean()

Kim     84.333
Park    92.333
Sam     85.667
Kwon    95.000
Lee     77.667
dtype: float64

#### 6. Pandas : Transposing

In [20]:
scores_df.T #행과 열 바뀜

Unnamed: 0,Math,Econ,Physics
Kim,87,96,70
Park,100,87,90
Sam,90,77,90
Kwon,100,90,95
Lee,83,65,85


In [21]:
scores_df.T.describe()

Unnamed: 0,Math,Econ,Physics
count,5.0,5.0,5.0
mean,92.0,83.0,86.0
std,7.714,12.186,9.618
min,83.0,65.0,70.0
25%,87.0,77.0,85.0
50%,90.0,87.0,90.0
75%,100.0,90.0,90.0
max,100.0,96.0,95.0


#### 7. Pandas : Sorting by index and values

In [22]:
season_temps.sort_index(ascending = False) # ascending=False : Descending 

Unnamed: 0,Spring,Summer,Fall,Winter
2,18,30,18,-5
1,14,27,21,0
0,10,24,24,8


In [23]:
scores_df.sort_index() 

Unnamed: 0,Kim,Park,Sam,Kwon,Lee
Econ,96,87,77,90,65
Math,87,100,90,100,83
Physics,70,90,90,95,85


In [24]:
scores_df.sort_index(axis=1) 

Unnamed: 0,Kim,Kwon,Lee,Park,Sam
Math,87,100,83,100,90
Econ,96,90,65,87,77
Physics,70,95,85,90,90


In [25]:
scores_df.sort_values(by = 'Econ',axis=1,ascending = False) #econ성적을 기준으로 

Unnamed: 0,Kim,Kwon,Park,Sam,Lee
Math,87,100,100,90,83
Econ,96,90,87,77,65
Physics,70,95,90,90,85


In [28]:
scores_df.T.sort_values(by = 'Econ',ascending = False)

Unnamed: 0,Math,Econ,Physics
Kim,87,96,70
Kwon,100,90,95
Park,100,87,90
Sam,90,77,90
Lee,83,65,85


#### 8. Pandas : One hot vector
- One hot vector : among the elements of a vector, only one element has 1 and others have 0.

In [36]:
auto_firms = ['Hundai','Honda','Kia','Audi','Benz','Hundai','Benz','Audi',' Hundai', 
              'Kia','Honda','Kia','Audi','Hundai','Benz']
Year = list(range(1990,2005,1))
Rank = list(range(15))
auto_df = pd.DataFrame({'Year':Year,'Rank':Rank,'Marker':auto_firms})
print(auto_df)

    Year  Rank   Marker
0   1990     0   Hundai
1   1991     1    Honda
2   1992     2      Kia
3   1993     3     Audi
4   1994     4     Benz
5   1995     5   Hundai
6   1996     6     Benz
7   1997     7     Audi
8   1998     8   Hundai
9   1999     9      Kia
10  2000    10    Honda
11  2001    11      Kia
12  2002    12     Audi
13  2003    13   Hundai
14  2004    14     Benz


In [40]:
am_onehot = pd.get_dummies(auto_df['Marker'])
print(am_onehot)

     Hundai  Audi  Benz  Honda  Hundai  Kia
0         0     0     0      0       1    0
1         0     0     0      1       0    0
2         0     0     0      0       0    1
3         0     1     0      0       0    0
4         0     0     1      0       0    0
5         0     0     0      0       1    0
6         0     0     1      0       0    0
7         0     1     0      0       0    0
8         1     0     0      0       0    0
9         0     0     0      0       0    1
10        0     0     0      1       0    0
11        0     0     0      0       0    1
12        0     1     0      0       0    0
13        0     0     0      0       1    0
14        0     0     1      0       0    0
