# Getting Start

In [20]:
import pandas as pd

## Pandas Basics

### pd.Series( )

In [21]:
basic_dataset = [1,2,3]
basic_mix_dataset = ["A","B","C",1,2,3,{"A1","B1","C1"}]
basic_multicol_dataset = {"name": ["Allan", "Beatriz", "Clara"], "age": [22,21,24]}
basic_dict = {"day_1": 344, "day_2": 423, "day_3": 314}

# pandas.Series(data, index, dtype, name, copy)
    # one-dimensional (column), holding any data type
        # if label isn't specified, values are labed with their index
            # using labels you could access an item referring them, for create use: pd.Series(data, index = [x,y,z])
my_series = pd.Series(basic_dict)
first_day = my_series["day_1"]

### pd.DataFrame( )

In [22]:
basic_dataset = [1,2,3]
basic_mix_dataset = ["A","B","C",1,2,3,{"A1","B1","C1"}]
basic_multicol_dataset = {"name": ["Allan", "Beatriz", "Clara"], "age": [22,21,24]}
basic_dict = {"day_1": 344, "day_2": 423, "day_3": 314}


# pandas.DataFrame(data, index, columns, dtype, copy)
    # two-dimensional (rows, columns) 
my_dataframe = pd.DataFrame(basic_multicol_dataset)
    
    # for access one-more specified row, use: .loc
my_dataframe.loc[1]     # one row, returns pd.Series(column) 
my_dataframe.loc[[0,2]] # multi rows, returns pd.DataFrame (rows, columns), for this you need to put [ [] ]
    
    # using "index", you can name the rows
my_dataframe = pd.DataFrame(basic_multicol_dataset, index=["Firts", "Second", "Third"])

### pd.read_csv( )

In [23]:
# for load data set stored as csv, use:
    # it will load as DataFrame
load_csv = pd.read_csv("data/data.csv") # specify file path

    # use ".to_string()" for print the entire DataFrame, with many rows it only will return first and last 5 rows
        # to check how many rows the system's display, use: "pd.options.display.max_rows" = [60]
print(load_csv.to_string())




     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.0
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

### pd.read_json( )

In [24]:
# for load data set stored as json, use:
    # JSON is plain text, but has the format of an object
load_json = pd.read_json("data/data.json")

    # for print the entire, uses ".to_string()"
print(load_json.to_string())

    # JSON have the same structure as Python Dicts, for this, you can load a Python dict directly as .DataFrame()
python_dict = {
  "Duration":{
    "0":60,
    "1":60,
    "2":60,
    "3":45,
    "4":45,
    "5":60
  },
  "Pulse":{
    "0":110,
    "1":117,
    "2":103,
    "3":109,
    "4":117,
    "5":102
  }
}
 
loading_pythondict_directly = pd.DataFrame(python_dict)

     Duration  Pulse  Maxpulse  Calories
0          60    110       130     409.1
1          60    117       145     479.0
2          60    103       135     340.0
3          45    109       175     282.4
4          45    117       148     406.0
5          60    102       127     300.5
6          60    110       136     374.0
7          45    104       134     253.3
8          30    109       133     195.1
9          60     98       124     269.0
10         60    103       147     329.3
11         60    100       120     250.7
12         60    106       128     345.3
13         60    104       132     379.3
14         60     98       123     275.0
15         60     98       120     215.2
16         60    100       120     300.0
17         45     90       112       NaN
18         60    103       123     323.0
19         45     97       125     243.0
20         60    108       131     364.2
21         45    100       119     282.0
22         60    130       101     300.0
23         45   

### Analyzing DataFrames

In [25]:
# for get a quicky overview of DataFrame, uses .head():
    # this method returns the headers and the specified number of rows, if you dont specify, returns the top 5 rows
load_json.head(5)

    # for vizualize the last N rows, use .tail()
load_json.tail(5)

    # for see more information about the data set, uses: .info()
        # tells how many rows and columns have, with the data type, tells how many Null values are present
load_json.info()


<class 'pandas.core.frame.DataFrame'>
Index: 169 entries, 0 to 168
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  169 non-null    int64  
 1   Pulse     169 non-null    int64  
 2   Maxpulse  169 non-null    int64  
 3   Calories  164 non-null    float64
dtypes: float64(1), int64(3)
memory usage: 6.6 KB


## Cleaning Data

### Cleaning Empty Cells

In [26]:
bad_dataset = pd.read_csv("data/bad_data.csv")

# for remove empty cells, use: .dropna( )
    # by default, this method returns a new DataFrame, not change the original
        # for change the original, uses the argument: "inplace = True"
bad_dataset.info() # before .dropna()

bad_dataset.dropna(inplace=True)

bad_dataset.info() # after .dropna()

# for replace NULL values with a specific one, uses .fillna()
bad_dataset.fillna(130, inplace=True)

# replace with MEAN MEDIAN and MODE
mean = bad_dataset["Calories"].mean()
median = bad_dataset["Calories"].median()
mode = bad_dataset["Calories"].mode
    # after this, just uses .fillna(mean, inplace = )


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 29 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  29 non-null     int64  
 1   Date      29 non-null     object 
 2   Pulse     29 non-null     int64  
 3   Maxpulse  29 non-null     int64  
 4   Calories  29 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


### Cleaning Wrong Format

In [27]:
# the column "Date" has two cells with the wrong format ["NaN", "20201226"]
    # for fix his, pandas has: to_datetime(), if have problemas, uses format = "mixed" as arg
bad_dataset = pd.read_csv('data/bad_data.csv')

bad_dataset["Date"] = pd.to_datetime(bad_dataset['Date'], format = "mixed")

print(bad_dataset.to_string())

# the NULL value still there, for remove empty cells from a specific column, uses: .dropna(subset["COLUMN"])
bad_dataset.dropna(subset=["Date"], inplace=True)

    Duration       Date  Pulse  Maxpulse  Calories
0         60 2020-12-01    110       130     409.1
1         60 2020-12-02    117       145     479.0
2         60 2020-12-03    103       135     340.0
3         45 2020-12-04    109       175     282.4
4         45 2020-12-05    117       148     406.0
5         60 2020-12-06    102       127     300.0
6         60 2020-12-07    110       136     374.0
7        450 2020-12-08    104       134     253.3
8         30 2020-12-09    109       133     195.1
9         60 2020-12-10     98       124     269.0
10        60 2020-12-11    103       147     329.3
11        60 2020-12-12    100       120     250.7
12        60 2020-12-12    100       120     250.7
13        60 2020-12-13    106       128     345.3
14        60 2020-12-14    104       132     379.3
15        60 2020-12-15     98       123     275.0
16        60 2020-12-16     98       120     215.2
17        60 2020-12-17    100       120     300.0
18        45 2020-12-18     90 

### Cleaning Wrong Data

In [28]:
# in row 7, the duration is 450, but for all the other rows the duration is between 30 and 60

dataframe = pd.read_csv("data/bad_data.csv")

# .loc [{position}, "{Column}"] = {value}
dataframe.loc[7, "Duration"] = 45 

# for big data sets, you can use a loop
for x in dataframe.index:
    if dataframe.loc[x, "Duration"] > 120:
        dataframe.drop(x, inplace=True)


### Removing Duplacates

In [29]:
duplicate_data = pd.read_csv("data/bad_data.csv")

# returns a Bool values for each row:
print(duplicate_data.duplicated())

# for remove uses:
duplicate_data.drop_duplicates(inplace=True)

print(duplicate_data.duplicated())

0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
12     True
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool
0     False
1     False
2     False
3     False
4     False
5     False
6     False
7     False
8     False
9     False
10    False
11    False
13    False
14    False
15    False
16    False
17    False
18    False
19    False
20    False
21    False
22    False
23    False
24    False
25    False
26    False
27    False
28    False
29    False
30    False
31    False
dtype: bool


## Correlations

### Pandas Correlations

In [None]:
df = pd.read_csv("data/data.csv")

# corr() method calculates the relationship between each column in your data set, this method ignores "not numeric"
df.corr()

# 1 means that there is a 1 to 1 relationship (a perfect correlation)

# 0.9 is also a good relationship, and if you increase one value, the other will probably increase as well.

# -0.9 would be just as good relationship as 0.9, but if you increase one value, the other will probably go down.

# 0.2 means NOT a good relationship, meaning that if one value goes up does not mean that the other will.

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
Duration,1.0,-0.155408,0.009403,0.922717
Pulse,-0.155408,1.0,0.786535,0.025121
Maxpulse,0.009403,0.786535,1.0,0.203813
Calories,0.922717,0.025121,0.203813,1.0
