# Daily Challenge : Advanced Data Analysis And Problem Solving With SciPy And NumPy

# Dataset creation

In [1]:
import numpy as np
import pandas as pd

# Seed for reproducibility
np.random.seed(0)

# Number of entries
n_entries = 1000

# Simulating dataset
time = np.linspace(0, 100, n_entries)  # Time variable
temperature = 20 + 5 * np.sin(np.pi * time / 50) + np.random.normal(0, 0.5, n_entries)  # Temperature (with some noise)
pressure = 1013 + 20 * np.cos(np.pi * time / 25) + np.random.normal(0, 1, n_entries)  # Pressure (with some noise)
chemical_concentration = 5 + 2 * np.sin(np.pi * time / 10) + np.random.normal(0, 0.2, n_entries)  # Chemical concentration

# Creating DataFrame
ninja_data = pd.DataFrame({
    'Time': time,
    'Temperature': temperature,
    'Pressure': pressure,
    'Chemical Concentration': chemical_concentration
})

In [2]:
df = ninja_data.copy()
df.head()

Unnamed: 0,Time,Temperature,Pressure,Chemical Concentration
0,0.0,20.882026,1033.555963,4.693416
1,0.1001,20.231526,1033.890892,4.72049
2,0.2002,20.552262,1032.571356,5.134934
3,0.3003,21.214783,1033.090475,4.99673
4,0.4004,21.059555,1033.202742,5.234754


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Time                    1000 non-null   float64
 1   Temperature             1000 non-null   float64
 2   Pressure                1000 non-null   float64
 3   Chemical Concentration  1000 non-null   float64
dtypes: float64(4)
memory usage: 31.4 KB


 # 1. Data Transformation:

In [26]:
# normalization
means = df.mean()
stds = df.std()

df_normal = (df-means)/stds

In [27]:
df_normal

Unnamed: 0,Time,Temperature,Pressure,Chemical Concentration
0,-1.729454,0.253054,1.444895,-0.206291
1,-1.725992,0.071093,1.468476,-0.187444
2,-1.722530,0.160811,1.375573,0.101064
3,-1.719067,0.346135,1.412122,0.004856
4,-1.715605,0.302714,1.420026,0.170552
...,...,...,...,...
995,1.715605,0.028892,1.410852,0.121938
996,1.719067,-0.047807,1.503425,-0.250366
997,1.722530,0.001911,1.416461,-0.107164
998,1.725992,-0.162974,1.325244,-0.205458


In [30]:
print(np.mean(df_normal))
print(np.std(df_normal))

-3.907985046680551e-17
Time                      0.9995
Temperature               0.9995
Pressure                  0.9995
Chemical Concentration    0.9995
dtype: float64


  return std(axis=axis, dtype=dtype, out=out, ddof=ddof, **kwargs)


In [32]:
# Logarithmic Scaling 
df_normal.var()

Time                      1.0
Temperature               1.0
Pressure                  1.0
Chemical Concentration    1.0
dtype: float64

In [33]:
df.var()

Time                      835.837506
Temperature                12.780219
Pressure                  201.735150
Chemical Concentration      2.063546
dtype: float64

In [34]:
df['Pressure'] = np.log(df['Pressure'])
df.var()

Time                      835.837506
Temperature                12.780219
Pressure                    0.000197
Chemical Concentration      2.063546
dtype: float64