<a href="https://colab.research.google.com/github/kondimidi/data-science-boot/blob/main/02_analiza_danych/01_pandas_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Pandas
>Strona biblioteki: [https://pandas.pydata.org/](https://pandas.pydata.org/)  
>Dokumentacja: [https://pandas.pydata.org/pandas-docs/stable/](https://pandas.pydata.org/pandas-docs/stable/)
>
>Podstawowa biblioteka do analizy danych w języku Python.
>
>Aby zainstalować bibliotekę Pandas, użyj polecenia poniżej:
```
pip install pandas
```
### Spis treści:
1. [Podstawowe struktury danych: pd.Series](#a1)
2. [Podstawowe struktury danych: pd.DataFrame](#a2)
3. [Selekcja kolumn](#a3)



In [6]:
import pandas as pd
import numpy as np
# Report the pandas and NumPy versions in use, separated by a blank line.
print(f"{pd.__version__} \n\n{np.__version__}")

2.2.2 

1.26.4


### <a name='a1'></a>  Podstawowe struktury danych: pd.Series

In [25]:
# Several ways of building a pd.Series; each one is printed followed by a blank line.
abcd_labels = ['a', 'b', 'c', 'd']
for s in (
    # plain integers, default integer index
    pd.Series(data=[3, 2, 4, 6]),
    # explicit string labels and a name; the float literal makes the data float64
    pd.Series(data=[3., 2, 4, 6], index=abcd_labels, name="something"),
    # a missing value (np.nan) among integers also yields float64
    pd.Series(data=[3, np.nan, 4, 6], index=abcd_labels, name="something"),
    # boolean data
    pd.Series(data=[True, False, False]),
    # NumPy range indexed by five consecutive daily timestamps
    pd.Series(data=np.arange(15, 20), index=pd.date_range(start='20200101', periods=5)),
):
    print(s, "\n")

# `s` is still the date-indexed Series here: show its index entries and dtype.
print(list(s.index))
print(s.dtypes, "\n")

# String data; this is the Series that remains bound to `s` after the cell.
s = pd.Series(data=['python', 'java', 'sql'], name='languages')
print(s)

0    3
1    2
2    4
3    6
dtype: int64 

a    3.0
b    2.0
c    4.0
d    6.0
Name: something, dtype: float64 

a    3.0
b    NaN
c    4.0
d    6.0
Name: something, dtype: float64 

0     True
1    False
2    False
dtype: bool 

2020-01-01    15
2020-01-02    16
2020-01-03    17
2020-01-04    18
2020-01-05    19
Freq: D, dtype: int64 

[Timestamp('2020-01-01 00:00:00'), Timestamp('2020-01-02 00:00:00'), Timestamp('2020-01-03 00:00:00'), Timestamp('2020-01-04 00:00:00'), Timestamp('2020-01-05 00:00:00')]
int64 

0    python
1      java
2       sql
Name: languages, dtype: object


In [26]:
# Inspect the Series bound to `s` (defined earlier in the notebook): its type,
# index, underlying values and dtype, each followed by a blank line; then its shape.
for attribute in (type(s), s.index, s.values, s.dtypes):
    print(attribute, "\n")
print(s.shape)

<class 'pandas.core.series.Series'> 

RangeIndex(start=0, stop=3, step=1) 

['python' 'java' 'sql'] 

object 

(3,)


In [34]:
# Build a Series from a dict: the keys become the index labels, np.nan marks a
# missing value (which forces the float64 dtype).
price = pd.Series(data={'Apple': 200, 'CD Projekt': 60, 'Amazon': 1900, 'KGHM': np.nan})
print(price, "\n")

# Element access: by label, then by integer position.
print(price['CD Projekt'], "\n")
print(price.iloc[1], "\n")

# Counting: count() skips NaN; value_counts(dropna=False) keeps NaN as its own bucket.
print(price.count(), "\n")
print(price.value_counts(dropna=False), "\n")

# Aggregations and summary statistics (NaN is ignored).
print(price.sum(), "\n")
print(price.min(), "\n")
print(price.max(), "\n")
print(price.std(), "\n")
print(price.describe(), "\n")

# Ordering helpers.
print(price.nlargest(2), "\n")
print(price.nsmallest(2), "\n")
print(price.rank(), "\n")
print(price.sort_values(), "\n")
print(price.sort_values(ascending=False), "\n")

# Scale every price by 3.8 (presumably a conversion rate to PLN, judging by the
# variable name - confirm). Vectorized arithmetic replaces
# `price.apply(lambda x: x * 3.8)`: same result, no per-element Python call.
price_pln = price * 3.8
print(price_pln, "\n")

# The multiplication returned a new Series; `price` itself is unchanged.
print(price)

Apple          200.0
CD Projekt      60.0
Amazon        1900.0
KGHM             NaN
dtype: float64 

60.0 

60.0 

3 

200.0     1
60.0      1
1900.0    1
NaN       1
Name: count, dtype: int64 

2160.0 

60.0 

1900.0 

1024.3046421841502 

count       3.000000
mean      720.000000
std      1024.304642
min        60.000000
25%       130.000000
50%       200.000000
75%      1050.000000
max      1900.000000
dtype: float64 

Amazon    1900.0
Apple      200.0
dtype: float64 

CD Projekt     60.0
Apple         200.0
dtype: float64 

Apple         2.0
CD Projekt    1.0
Amazon        3.0
KGHM          NaN
dtype: float64 

CD Projekt      60.0
Apple          200.0
Amazon        1900.0
KGHM             NaN
dtype: float64 

Amazon        1900.0
Apple          200.0
CD Projekt      60.0
KGHM             NaN
dtype: float64 

Apple          760.0
CD Projekt     228.0
Amazon        7220.0
KGHM             NaN
dtype: float64 

Apple          200.0
CD Projekt      60.0
Amazon        1900.0
KGHM             NaN
dtype: float64

### <a name='a2'></a>  Podstawowe struktury danych: pd.DataFrame


In [39]:
# Three ways of building a DataFrame, each printed followed by a blank line.
for df in (
    # flat list -> one column with default row and column labels
    pd.DataFrame(data=[12, 12, 32]),
    # flat list with explicit row labels and a column name
    pd.DataFrame(data=[11, 12, 32], index=['first', 'second', 'third'], columns=['col_1']),
    # dict of lists -> one column per key
    pd.DataFrame(data={'WIG20': ['PKN ORLEN', 'PKO BP'],
                       'mWIG40': ['Amica', 'Playway']}),
):
    print(df, "\n")

# Nested lists -> one row per inner list; this frame stays bound to `df` after the cell.
df = pd.DataFrame(
    data=[[10, 12, 13], [23, 12, 10]],
    index=['first', 'second'],
    columns=['col_1', 'col_2', 'col_3'],
)
print(df)

    0
0  12
1  12
2  32 

        col_1
first      11
second     12
third      32 

       WIG20   mWIG40
0  PKN ORLEN    Amica
1     PKO BP  Playway 

        col_1  col_2  col_3
first      10     12     13
second     23     12     10


In [40]:
# Structural attributes of `df` (defined earlier in the notebook):
# column labels, row labels and the underlying NumPy array.
print(df.columns, "\n")
print(df.index, "\n")
print(df.values, "\n")

# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() - that would emit a stray "None" line.
df.info()
print()  # blank separator line, matching the other outputs in this cell

# Summary statistics, then the same table transposed (one row per column).
print(df.describe(), "\n")
print(df.describe().T)

Index(['col_1', 'col_2', 'col_3'], dtype='object') 

Index(['first', 'second'], dtype='object') 

[[10 12 13]
 [23 12 10]] 

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, first to second
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col_1   2 non-null      int64
 1   col_2   2 non-null      int64
 2   col_3   2 non-null      int64
dtypes: int64(3)
memory usage: 64.0+ bytes
None 

           col_1  col_2     col_3
count   2.000000    2.0   2.00000
mean   16.500000   12.0  11.50000
std     9.192388    0.0   2.12132
min    10.000000   12.0  10.00000
25%    13.250000   12.0  10.75000
50%    16.500000   12.0  11.50000
75%    19.750000   12.0  12.25000
max    23.000000   12.0  13.00000 

       count  mean       std   min    25%   50%    75%   max
col_1    2.0  16.5  9.192388  10.0  13.25  16.5  19.75  23.0
col_2    2.0  12.0  0.000000  12.0  12.00  12.0  12.00  12.0
col_3    2.0  11.5  2.121320  10.0  10.75  11.5  12.25  13.0

### <a name='a3'></a>  Selekcja kolumn

In [44]:
# Show the whole frame (`df` is defined earlier in the notebook) ...
print(df, "\n")

# ... then select one column two ways and print each result with its type:
# a single label gives a Series, a list of labels gives a DataFrame.
for selection in (df['col_1'], df[['col_1']]):
    print(selection, "\n")
    print(type(selection), "\n")

        col_1  col_2  col_3
first      10     12     13
second     23     12     10 

first     10
second    23
Name: col_1, dtype: int64 

<class 'pandas.core.series.Series'> 

        col_1
first      10
second     23 

<class 'pandas.core.frame.DataFrame'> 



In [45]:
# Rename the columns of `df` (defined earlier in the notebook) through an explicit
# old -> new mapping. Unlike positional `df.columns = [...]`, this does not depend
# on column order or count, and re-running the cell after 'd' has been added is a
# harmless no-op instead of a length-mismatch ValueError.
df = df.rename(columns={'col_1': 'a', 'col_2': 'sprzedaz_grudzien', 'col_3': 'c'})
print(df, "\n")

# Attribute-style access works because the column name is a valid Python identifier.
print(df.sprzedaz_grudzien, "\n")

# New column computed element-wise from two existing columns.
df['d'] = df.a + df.c
print(df, "\n")

         a  sprzedaz_grudzien   c
first   10                 12  13
second  23                 12  10 

first     12
second    12
Name: sprzedaz_grudzien, dtype: int64 

         a  sprzedaz_grudzien   c   d
first   10                 12  13  23
second  23                 12  10  33 



In [46]:
# Rebuild the 2x3 example frame so this cell does not depend on earlier renames.
df = pd.DataFrame(
    data=[[10, 12, 13], [23, 12, 10]],
    index=['first', 'second'],
    columns=['col_1', 'col_2', 'col_3'],
)

# Print the frame and a series of selections, each followed by a blank line.
for selection in (
    df,
    df.loc['first'],            # row by label
    df.loc['first', 'col_2'],   # single cell by row and column label
    df.loc[:, 'col_2'],         # whole column by label
    df.iloc[0],                 # row by integer position
    df.iloc[0, 1],              # single cell by integer positions
):
    print(selection, "\n")

        col_1  col_2  col_3
first      10     12     13
second     23     12     10 

col_1    10
col_2    12
col_3    13
Name: first, dtype: int64 

12 

first     12
second    12
Name: col_2, dtype: int64 

col_1    10
col_2    12
col_3    13
Name: first, dtype: int64 

12 

