<div style="line-height:1.2;">

<h1 style="color:#D68910; margin-bottom: 0.2em;">Scipy basics 2</h1>
<h4 style="margin-top: 0.2em;">Example of Statistics.</h4>

</div>

<div style="margin-top: 10px;">
<span style="display: inline-block;">
    <h3 style="color: lightblue; display: inline; margin-bottom: 0;">Keywords:</h3> ne-sample t-test + Two-sample t-test + Pearson correlation coefficient + Spearman rank-order correlation coefficient +  p-value + Kendall rank correlation coefficient 
</span>
</div>

In [6]:
import numpy as np
import pandas as pd
from scipy import stats

In [7]:
""" Read csv """
df_squeeze = pd.read_csv("./data/squeeze_acquisitions.csv", header=None, skiprows=None, sep=';')
df_squeeze.head()
df_squeeze.columns = ['front_left_pressure_voltage', 'front_right_pressure_voltage', 'right_pressure_voltage', 'back_pressure_voltage', 'left_pressure_voltage', 
                'gyroscope_1_voltage', 'gyroscope_2_voltage']

df_squeeze.iloc[:, :5] = df_squeeze.iloc[:, :5].astype(float)

col_dtype = df_squeeze['gyroscope_1_voltage'].dtype

print(col_dtype)
df_squeeze.head()

int64


Unnamed: 0,front_left_pressure_voltage,front_right_pressure_voltage,right_pressure_voltage,back_pressure_voltage,left_pressure_voltage,gyroscope_1_voltage,gyroscope_2_voltage
0,0.42,0.43,0.38,0.28,0.3,35,-136
1,0.49,0.16,0.47,0.07,0.38,243,151
2,0.26,0.21,0.25,0.38,0.45,-62,-206
3,0.58,0.23,0.29,0.04,0.54,78,29
4,0.22,0.18,0.61,0.03,0.48,52,-153


In [8]:
random_numbers = np.random.randint(low=0, high=101, size=len(df_hug)) #array of 1370 random integers between 0 and 100
df_temp = pd.DataFrame({'numerical': random_numbers})
print(df_temp.head(10))

   numerical
0         59
1         86
2         55
3         29
4         13
5          0
6        100
7         65
8          5
9          4


In [9]:
X = pd.DataFrame()

X['pressure_mean'] = df_squeeze[['front_left_pressure_voltage', 'front_right_pressure_voltage', 
                            'right_pressure_voltage', 'back_pressure_voltage', 'left_pressure_voltage']].mean(axis=1)

X['gyroscope_sum'] = df_squeeze['gyroscope_1_voltage'] + df_squeeze['gyroscope_2_voltage']

# Create a new "category" column based on a numerical column
X['category'] = pd.cut(df_temp['numerical'], bins=[0, 25, 50, 75, 100], labels=['low', 'medium', 'high', 'very high'])

print("df_squeeze.columns ==> {}".format(df_squeeze.columns))
print()
print("X.columns ==> {}".format(X.columns))
print(X.head(10))

df_squeeze.columns ==> Index(['front_left_pressure_voltage', 'front_right_pressure_voltage',
       'right_pressure_voltage', 'back_pressure_voltage',
       'left_pressure_voltage', 'gyroscope_1_voltage', 'gyroscope_2_voltage'],
      dtype='object')

X.columns ==> Index(['pressure_mean', 'gyroscope_sum', 'category'], dtype='object')
   pressure_mean  gyroscope_sum   category
0          0.362           -101       high
1          0.314            394  very high
2          0.310           -268       high
3          0.336            107     medium
4          0.304           -101        low
5          0.326            551        NaN
6          0.282            -89  very high
7          0.300            537       high
8          0.298            -91        low
9          0.296            252        low


In [10]:
# Get a descriptive statistics of the dataset
desc_stats = X.describe()
print(desc_stats)
print()

# One-sample t-test to test if the population mean of 'pressure_mean' is equal to 0.4
t_stat, p_value = stats.ttest_1samp(X['pressure_mean'], 0.4)
print('One-sample t-test: t = {:.3f}, p = {:.3f}'.format(t_stat, p_value))

### Calc the (two-sampled) t-test to check if the population means of 'gyroscope_sum' are equal for 'high' and 'very high' categories
high_gyro = X.loc[X['category'] == 'high', 'gyroscope_sum']
very_high_gyro = X.loc[X['category'] == 'very high', 'gyroscope_sum']
t_stat, p_value = stats.ttest_ind(high_gyro, very_high_gyro)
print('Two-sample t-test: t = {:.3f}, p = {:.3f}'.format(t_stat, p_value))

# Calc the Pearson correlation coefficient and p-value between 'pressure_mean' and 'gyroscope_sum'
corr_coef, p_value = stats.pearsonr(X['pressure_mean'], X['gyroscope_sum'])
print('Pearson correlation: r = {:.3f}, p = {:.3f}'.format(corr_coef, p_value))

# Calc the Spearman rank-order correlation coefficient and p-value between 'pressure_mean' and 'gyroscope_sum'
corr_coef, p_value = stats.spearmanr(X['pressure_mean'], X['gyroscope_sum'])
print('Spearman correlation: rho = {:.3f}, p = {:.3f}'.format(corr_coef, p_value))

# Compute the Kendall rank correlation coefficient and p-value between 'pressure_mean' and 'gyroscope_sum'
corr_coef, p_value = stats.kendalltau(X['pressure_mean'], X['gyroscope_sum'])
print('Kendall correlation: tau = {:.3f}, p = {:.3f}'.format(corr_coef, p_value))

       pressure_mean  gyroscope_sum
count    1370.000000    1370.000000
mean        0.210266      54.925547
std         0.038589     222.339175
min         0.086000    -405.000000
25%         0.182000     -97.000000
50%         0.208000     -91.000000
75%         0.240000     268.750000
max         0.362000     609.000000

One-sample t-test: t = -181.986, p = 0.000
Two-sample t-test: t = -0.131, p = 0.896
Pearson correlation: r = -0.005, p = 0.864
Spearman correlation: rho = -0.043, p = 0.111
Kendall correlation: tau = -0.031, p = 0.094


In [11]:
X['category'].value_counts()

very high    368
high         343
low          333
medium       317
Name: category, dtype: int64

In [12]:
missing_values = X['category'].isnull().sum()
print('Missing values:', missing_values)

Missing values: 9


In [13]:
""" Fill the missing values with a random category """
available_categories = X['category'].dropna().unique()

In [14]:
observed_total = X['category'].value_counts().sum()
expected_freqs = pd.Series({'low': 1370/4, 'medium': 1370/4, 'high': 1370/4, 'very high': 1370/4})
expected_total = expected_freqs.sum()

print('Observed total:', observed_total)
print('Expected total:', expected_total)

Observed total: 1361
Expected total: 1370.0
