<div style="line-height:0.5">
<h1 style="color:lightcoral"> Pandas basics 3 </h1>
</div>
<div style="line-height:1.5">
<div style="margin-top: -8px;">
<span style="display: inline-block;">
    <h3 style="color: lightblue; display: inline;">Keywords:</h3> Operations with Series + Missing values + pd.concat() + clip() 
</span>
</div>
</div>

In [34]:
import math
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from scipy.spatial.distance import euclidean

In [35]:
""" Create dataframes """
data = [
    ('circle', 'first', 'S'),
    ('oval', 'second', 'M'),
    ('square', 'third', 'L'),
    ('triangle', 'fourth', 'XL')]

df = pd.DataFrame(data, columns=['Shape', 'Class', 'Size'])

data = np.array([
    ['circle', 'first', 'S'],
    ['oval', 'second', 'M'],
    ['square', 'third', 'L'],
    ['triangle', 'fourth', 'XL']
])

df = pd.DataFrame(data, columns=['Shape', 'Class', 'Size'])

shape = ['circle', 'oval', 'square', 'triangle']
klass = ['first', 'second', 'third', 'fourth']
size = ['S', 'M', 'L', 'XL']

data = []

for i in range(len(shape)):
    data.append({'Shape': shape[i], 'Class': klass[i], 'Size': size[i]})

df = pd.DataFrame(data)

<h2 style="color:lightcoral"> <u> #0 Series </u></h2>

In [5]:
""" Get all items not in common """
serie1 = pd.Series([1,2,3,4,5])
serie2 = pd.Series([6,7,8,4,5])
serie_union = pd.Series(np.union1d(serie1,serie2))
serie_intersection = pd.Series(np.intersect1d(serie1, serie2))
series_miss = serie_union[~serie_union.isin(serie_intersection)]
serie_union, serie_intersection, series_miss

(0    1
 1    2
 2    3
 3    4
 4    5
 5    6
 6    7
 7    8
 dtype: int64,
 0    4
 1    5
 dtype: int64,
 0    1
 1    2
 2    3
 5    6
 6    7
 7    8
 dtype: int64)

In [14]:
""" Get frequency values (sorted in descending order). """
ra = np.random.RandomState(100)
serie3 = pd.Series(np.random.randint(1,15,[12]))

print(ra)
print(serie3)
print(serie3.value_counts())

RandomState(MT19937)
0     10
1      6
2      6
3     12
4      1
5     12
6      8
7      5
8     12
9      2
10     1
11    12
dtype: int64
12    4
6     2
1     2
10    1
8     1
5     1
2     1
Name: count, dtype: int64


In [27]:
""" Get a new Series containing only the values from serie3 that are not the two most common values in serie3. """
res = serie3[~serie3.isin(serie3.value_counts().index[:2])]
res

1    1
2    1
dtype: object

In [26]:
""" Find position of multiple of 3 """
serie4 = pd.Series(np.random.randint(100,200,7))
pos = np.argwhere(serie4 % 3 == 0)

serie4, pos

(0    136
 1    120
 2    128
 3    153
 4    132
 5    183
 6    116
 dtype: int64,
 array([[1],
        [3],
        [4],
        [5]]))

Compute Euclidean Distance between Series values: => 
$ d(\mathbf{a}, \mathbf{b}) = \sqrt{\sum_{i=1}^{n} (a_{i} - b_{i})^2}$

In [35]:
s1 = pd.Series([1,2,3,4,5,6,7,8,9,10])
s2 = pd.Series([10,9,8,7,6,5,4,3,2,1])

#1 
print(np.linalg.norm(s1 -s2))
#2 
print(euclidean(s1, s2))
#3 
print(math.sqrt(sum((x - y) ** 2 for x, y in zip(s1, s2))))
#4 
print(np.sqrt(np.sum((s1 - s2) ** 2)))

18.16590212458495
18.16590212458495
18.16590212458495
18.16590212458495


<h2 style="color:lightcoral"> <u> #1 Handling missing values </u></h2>

In [36]:
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)

df_dropped = df.dropna()
print(df_dropped)

     A    B   C
0  1.0  5.0   9
3  4.0  8.0  12


In [37]:
""" Forward fill missing values """
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)

df_ffill = df.ffill()
print(df_ffill)

     A    B   C
0  1.0  5.0   9
1  2.0  5.0  10
2  2.0  5.0  11
3  4.0  8.0  12


In [38]:
""" Interpolate missing values """
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)

df_interpolated = df.interpolate()
print(df_interpolated)

     A    B   C
0  1.0  5.0   9
1  2.0  6.0  10
2  3.0  7.0  11
3  4.0  8.0  12


In [45]:
""" Hot-deck imputation """
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)

for i in range(len(df)):
    if pd.isnull(df.loc[i,'B']):
        #df.loc[i,'B'] = df.loc[df['A'] == df.loc[i,'A'], 'B'].iloc[0]      #no! leads to IndexError: single positional indexer is out-of-bounds
        b_values = df.loc[df['A'] == df.loc[i,'A'], 'B']
        if b_values.notnull().any():
            df.loc[i,'B'] = b_values.iloc[0]        
print(df)

     A    B   C
0  1.0  5.0   9
1  2.0  NaN  10
2  NaN  NaN  11
3  4.0  8.0  12


In [46]:
""" Hot-deck imputation 2 """
data = {
    'A': [1, 2, np.nan, 4],
    'B': [5, np.nan, np.nan, 8],
    'C': [9, 10, 11, 12]
}
df = pd.DataFrame(data)

for i in range(len(df)):
    if pd.isnull(df.loc[i,'B']):
        matching_rows = df['A'] == df.loc[i,'A']
        matching_non_missing_rows = df.loc[matching_rows, 'B'].dropna()
        if len(matching_non_missing_rows) > 0:
            df.loc[i,'B'] = matching_non_missing_rows.iloc[0]
        else:
            non_missing_values = df['B'].dropna()
            if len(non_missing_values) > 0:
                df.loc[i,'B'] = non_missing_values.iloc[0]


print(df)

     A    B   C
0  1.0  5.0   9
1  2.0  5.0  10
2  NaN  5.0  11
3  4.0  8.0  12


In [48]:
""" Dataframe from numpy array """
data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
df = pd.DataFrame(data, columns=['A', 'B', 'C'])                  
# Select a subset of the DataFrame
subset = df.loc[0:1, ['B', 'C']]                    
print(df)
print(subset)
# Modify the subset 
subset.loc[0, 'B'] = 10                             

print("----------------------------after")
print(df)
print(subset)

   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
   B  C
0  2  3
1  5  6
----------------------------after
   A  B  C
0  1  2  3
1  4  5  6
2  7  8  9
    B  C
0  10  3
1   5  6


<h3 style="color:lightcoral"> <u> Example #2 </u></h3>

In [3]:
""" 
# Error in "pressure " column name with end space => change it!
# using sep=', ' take Nan for pressions.
# Lead to the ParserWarning: Falling back to the 'python' engine because the 'c' engine does not support regex separators
#df = pd.read_csv("./data_pandas/dataset_to_try.csv", sep=', ') 
"""

df = pd.read_csv("./data_pandas/dataset_to_try.csv")
df

Unnamed: 0,time_passed,pressure
0,0,76
1,12,86
2,21,88
3,33,99
4,42,54
5,59,77
6,66,78
7,73,76
8,82,12
9,99,13


In [4]:
print(f'type(df["pressure"]) is = {type(df["pressure"])}')
print(f'type(df["time_passed"]) is = {type(df["time_passed"])}')
df.columns

type(df["pressure"]) is = <class 'pandas.core.series.Series'>
type(df["time_passed"]) is = <class 'pandas.core.series.Series'>


Index(['time_passed', 'pressure'], dtype='object')

In [5]:
# Check difference!
print(df.describe)
print()
print(df.describe())

<bound method NDFrame.describe of     time_passed pressure
0             0    0,076
1            12    0,086
2            21    0,088
3            33    0,099
4            42    0,054
5            59    0,077
6            66    0,078
7            73    0,076
8            82     0,12
9            99     0,13
10          120     0,17
11          130     0,18
12          145    0,077
13          177    0,076
14          188    0,095
15          200     0,15
16           42     0,22
17           49     0,27
18           21     0,24
19           33    0,076
20           42    0,095
21           49    0,247
22           33    0,245
23           42    0,455
24           49    0,077
25           33    0,295
26           42    0,222
27           49    0,244
28           33    0,234
29           42    0,266
30           49    0,276
31          100    0,243>

       time_passed
count    32.000000
mean     67.343750
std      51.626206
min       0.000000
25%      33.000000
50%      49.000000
75%   

In [6]:
""" How to face the situation in which I have strings elements instead of float numbers ? 
When I have the comma instead of the dot for floats """
type(df["pressure"])
type(df["time_passed"])

a = df["pressure"][0]
print(type(a))

#df['pressure'] = df['pressure'].astype(float)                                      #ValueError: could not convert string to float: '0,076' since there are non-numeric values
#df['pressure'] = pd.to_numeric(df['pressure'], errors='coerce').astype(float)      #transform all to NaN => useless!
#df['pressure'] = df['pressure'].str.replace(',', '.').astype(float)                #AttributeError: Can only use .str accessor with string values!

df['pressure'] = df['pressure'].astype(str).str.replace(',', '.').astype(float)     #OK!!


a = df["pressure"][0]
print(type(a))

df['pressure'].mean()

<class 'str'>
<class 'numpy.float64'>


0.16678125

In [7]:
df[:14]

Unnamed: 0,time_passed,pressure
0,0,0.076
1,12,0.086
2,21,0.088
3,33,0.099
4,42,0.054
5,59,0.077
6,66,0.078
7,73,0.076
8,82,0.12
9,99,0.13


In [12]:
pressure = df[df.columns[len(df.columns)-1]]
time_pass = df[df.columns[:len(df.columns)-1]]

print(pressure[:5])
print(time_pass[:5])

0    0.076
1    0.086
Name: pressure, dtype: float64
   time_passed
0            0
1           12


<h3 style="color:lightcoral"> <u> Example #3 </u></h3>

<h3 style="color:#A4ED8E"> Recap: </h3>
<div style="margin-top: -20px;">
Interpolation technique can be used for filling in gaps caused by missing values by drawing a line or curve <br>
between the known values bordering the gap and using that line or curve to predict reasonable values.  <br>
Interpolation can be particularly useful when the time intervals between are constant, the data is not prone to noisy fluctuations <br>
and the gaps caused by missing values are small. <br>
</div>

In [2]:
""" Interpolate the missing values using the 'linear' method
By fitting a line starting at 2.0 and ending at 5.0, we can make reasonable guesses \
for the two missing values in between of 3.0 and 4.0.
"""

df = pd.DataFrame({
    'A': [1, 2, np.nan, 4, np.nan],
    'B': [5, np.nan, 7, np.nan, 9]
})
df1 = df.interpolate(method='linear')
#or
df.ffill(inplace = True)
# Print the DataFrame
df2 = df.ffill()

In [3]:
"""If we believe the line between the two known points is nonlinear, 
we can use interpo late’s method to specify the interpolation method"""
df3 = df.interpolate(method='quadratic')
df4 = df.interpolate(limit=1, limit_direction="forward")
print(df)
print()
print(df1)
print()
print(df2)
print()
print(df3)
print()
print(df4)

     A    B
0  1.0  5.0
1  2.0  5.0
2  2.0  7.0
3  4.0  7.0
4  4.0  9.0

     A    B
0  1.0  5.0
1  2.0  6.0
2  3.0  7.0
3  4.0  8.0
4  4.0  9.0

     A    B
0  1.0  5.0
1  2.0  5.0
2  2.0  7.0
3  4.0  7.0
4  4.0  9.0

     A    B
0  1.0  5.0
1  2.0  5.0
2  2.0  7.0
3  4.0  7.0
4  4.0  9.0

     A    B
0  1.0  5.0
1  2.0  5.0
2  2.0  7.0
3  4.0  7.0
4  4.0  9.0


<h2 style="color:lightcoral"> <u> 2 Generate new rows from input rows </u></h2>

In [2]:
range_dict =  {
    'quiet': (0.13, 0.05), 
    'caress': (0.18, 0.01), 
    'pinch': (0.22, 0.23), 
    'pat': (1.27, 0.43), 
    'squeeze': (2.32, 0.50), 
    'hug': (3.23, 0.53), 
    'hit': (4.32, 0.82), 
    'touch': (0.13, 0.25)
    }

In [4]:
num_new_rows = 10

# Load the CSV file into a Pandas DataFrame
df = pd.read_csv('./data_pandas/data_for_third_notebook.csv')

# Define a function to generate new rows from each input row
def create_new_rows(row):
    # Look up the range_mean and range_std values for this row
    response = row["Response"]
    range_mean, range_std = range_dict[response]
    
    # Generate new rows with slightly different values
    new_rows = []
    for i in range(num_new_rows):
        new_row = row.copy()
        val_to_add_1 = np.random.uniform(low=-range_mean, high=range_mean, size=len(new_row.iloc[:-1]))
        value_ok_1 = np.round(val_to_add_1 * 100) / 100    
        new_row.iloc[:-1] += value_ok_1
        val_to_add_2 = np.random.uniform(low=-range_std, high=range_std, size=len(new_row.iloc[15:-1]))
        value_ok_2 = np.round(val_to_add_2 * 100) / 100    
        new_row.iloc[15:-1] += value_ok_2 
        new_rows.append(new_row)
    # Return the new rows as a DataFrame
    return pd.DataFrame(new_rows)

In [5]:
# Apply the function to each row in the DataFrame and concatenate the results
new_df = pd.concat([create_new_rows(row) for _, row in df.iterrows()], ignore_index=True)
# Fix again the precision of the decimal part to 2f 
new_df = new_df.round(2)

# Remove the "Response" column
response_col = new_df.pop("Response")

# Apply the operation to the remaining columns
new_df = new_df.clip(lower=0)
#or 
#new_df = new_df.applymap(lambda x: -x if x < 0 else x)

In [6]:
# Reattach the "Response" column
new_df["Response"] = response_col

# Combine the original DataFrame and the new DataFrame
final_df = pd.concat([df, new_df], ignore_index=True)

# Write the final DataFrame to a new CSV file
final_df.to_csv('./data_pandas/new_dataset.csv', index=False)

In [7]:
final_df.head()

Unnamed: 0,Min0,Min1,Min2,Min3,Min4,Max0,Max1,Max2,Max3,Max4,...,NPlateau31,NPlateau32,NPlateau33,NPlateau34,NPlateau40,NPlateau41,NPlateau42,NPlateau43,NPlateau44,Response
0,21.0,24.0,18.0,21.0,21.0,22.0,25.0,19.0,22.0,22.0,...,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,quiet
1,21.0,24.0,18.0,20.0,20.0,22.0,25.0,19.0,22.0,22.0,...,1.0,1.0,0.0,0.0,8.0,1.0,1.0,0.0,0.0,quiet
2,21.0,24.0,18.0,21.0,21.0,22.0,25.0,19.0,22.0,22.0,...,1.0,0.0,0.0,0.0,12.0,1.0,0.0,0.0,0.0,quiet
3,21.0,24.0,18.0,20.0,20.0,22.0,25.0,20.0,22.0,22.0,...,2.0,0.0,0.0,0.0,9.0,2.0,0.0,0.0,0.0,quiet
4,21.0,24.0,18.0,21.0,21.0,22.0,25.0,19.0,22.0,22.0,...,3.0,0.0,0.0,1.0,6.0,3.0,0.0,0.0,1.0,quiet
