# Statistical Testing

General imports

In [54]:
import pandas as pd
import numpy as np

Load the dataset

In [55]:
# Read the pivoted table from disk, then discard its first column.
# NOTE(review): the first column is presumably an index written by an earlier
# export step - confirm against the code that produced JO_pivoted.csv.
raw = pd.read_csv("JO_pivoted.csv")
name = raw.columns[0]
df = raw.drop(columns=name)
df

Unnamed: 0,region,year,barley,energy forest,fallow land,"field peas for cooking, fodder peas, vetches and field beans",green fodder,green peas,horticulture plants,ley for hay and forage plants,...,triticale,unspecified arable land,utilized ley for hay,utilized ley for hay and pasture,utilized pasture,white beans,winter barley,winter rape,winter turnip rape,winter wheat
0,0114 Upplands Väsby,1981,500.0,0.0,179.0,0.0,43.0,0.0,0.0,0.0,...,0.0,0.0,0.0,229.0,0.0,0.0,0.0,0.0,0.0,80.0
1,0114 Upplands Väsby,1985,586.0,0.0,30.0,11.0,63.0,0.0,0.0,0.0,...,0.0,0.0,0.0,201.0,0.0,0.0,0.0,0.0,0.0,40.0
2,0114 Upplands Väsby,1989,264.0,0.0,124.0,22.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,275.0,0.0,0.0,0.0,0.0,14.0,477.0
3,0114 Upplands Väsby,1990,213.0,0.0,57.0,38.0,2.0,0.0,0.0,0.0,...,0.0,0.0,0.0,213.0,0.0,0.0,0.0,0.0,2.0,520.0
4,0114 Upplands Väsby,1991,328.0,0.0,91.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,217.0,0.0,0.0,0.0,0.0,6.0,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4055,2584 Kiruna,1999,0.0,0.0,17.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,272.0,0.0,0.0,0.0,0.0,0.0,0.0
4056,2584 Kiruna,2001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,109.0,0.0,151.0,0.0,0.0,0.0,0.0,0.0,0.0
4057,2584 Kiruna,2002,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,...,0.0,90.0,0.0,140.0,0.0,0.0,0.0,0.0,0.0,0.0
4058,2584 Kiruna,2003,0.0,0.0,15.0,0.0,0.0,0.0,0.0,0.0,...,0.0,69.0,0.0,143.0,0.0,0.0,0.0,0.0,0.0,0.0


## Normal Distribution Findings

Find all features that have a normal distribution

In [56]:
import numpy as np
from scipy.stats import shapiro, normaltest, anderson

Drop the categorical `region` column so that only numeric columns are passed to the normality tests

In [57]:
# Remove the `region` column (the one categorical column dropped by this notebook).
# `df` is rebound to the result, so re-running this cell would otherwise raise
# KeyError because the column is already gone; errors="ignore" makes the cell
# safe to run more than once.
df = df.drop(columns="region", errors="ignore")

Check for normal distribution using Shapiro-Wilk

In [58]:
# Shapiro-Wilk normality test on every column of `df`.
# A column is labelled "Normal" when its p-value exceeds 0.05, i.e. the test
# does not reject normality at the 5% level; otherwise it is "Not Normal".
normal = []
not_normal = []
for feature in df.columns:
    # Missing values are dropped first, as the Anderson-Darling cell does.
    # If the test yields a NaN p-value, `p > 0.05` is False and the column
    # would be labelled "Not Normal" without any visible sign of the problem.
    stat, p = shapiro(df[feature].dropna())
    if p > 0.05:
        normal.append((feature, p))
    else:
        not_normal.append((feature, p))

normal_df = pd.DataFrame(normal, columns=['Feature', 'p-value'])
normal_df['Normality'] = 'Normal'

not_normal_df = pd.DataFrame(not_normal, columns=['Feature', 'p-value'])
not_normal_df['Normality'] = 'Not Normal'

# Build the combined table straight from the labelled rows instead of
# pd.concat([normal_df, not_normal_df]): when one of the two groups is empty,
# concatenating an empty frame emits a FutureWarning in recent pandas.
# Row order is unchanged: all "Normal" rows first, then all "Not Normal" rows.
labelled_rows = (
    [(col, pval, 'Normal') for col, pval in normal]
    + [(col, pval, 'Not Normal') for col, pval in not_normal]
)
result_df = pd.DataFrame(labelled_rows, columns=['Feature', 'p-value', 'Normality'])
result_df.sort_values(by='Feature').reset_index(drop=True)

  result_df = pd.concat([normal_df, not_normal_df], ignore_index=True)


Unnamed: 0,Feature,p-value,Normality
0,barley,5.132168e-77,Not Normal
1,energy forest,1.608787e-84,Not Normal
2,fallow land,2.883799e-71,Not Normal
3,"field peas for cooking, fodder peas, vetches a...",4.943592e-76,Not Normal
4,green fodder,9.97751e-69,Not Normal
5,green peas,6.576868e-87,Not Normal
6,horticulture plants,3.4654449999999995e-85,Not Normal
7,ley for hay and forage plants,1.25405e-79,Not Normal
8,ley for seeds,1.6948600000000002e-75,Not Normal
9,mixed grain,3.5200430000000003e-75,Not Normal


Let's see if SciPy's `normaltest` (the D'Agostino-Pearson test) agrees

In [59]:
# D'Agostino-Pearson normality test (scipy.stats.normaltest) on every column of `df`.
# A column is labelled "Normal" when its p-value exceeds 0.05, i.e. the test
# does not reject normality at the 5% level; otherwise it is "Not Normal".
normal = []
not_normal = []
for feature in df.columns:
    # Missing values are dropped first, as the Anderson-Darling cell does.
    # If the test yields a NaN p-value, `p > 0.05` is False and the column
    # would be labelled "Not Normal" without any visible sign of the problem.
    stat, p = normaltest(df[feature].dropna())
    if p > 0.05:
        normal.append((feature, p))
    else:
        not_normal.append((feature, p))

normal_df = pd.DataFrame(normal, columns=['Feature', 'p-value'])
normal_df['Normality'] = 'Normal'

not_normal_df = pd.DataFrame(not_normal, columns=['Feature', 'p-value'])
not_normal_df['Normality'] = 'Not Normal'

# Build the combined table straight from the labelled rows instead of
# pd.concat([normal_df, not_normal_df]): when one of the two groups is empty,
# concatenating an empty frame emits a FutureWarning in recent pandas.
# Row order is unchanged: all "Normal" rows first, then all "Not Normal" rows.
labelled_rows = (
    [(col, pval, 'Normal') for col, pval in normal]
    + [(col, pval, 'Not Normal') for col, pval in not_normal]
)
result_df = pd.DataFrame(labelled_rows, columns=['Feature', 'p-value', 'Normality'])
result_df.sort_values(by='Feature').reset_index(drop=True)

  result_df = pd.concat([normal_df, not_normal_df], ignore_index=True)


Unnamed: 0,Feature,p-value,Normality
0,barley,0.0,Not Normal
1,energy forest,0.0,Not Normal
2,fallow land,0.0,Not Normal
3,"field peas for cooking, fodder peas, vetches a...",0.0,Not Normal
4,green fodder,0.0,Not Normal
5,green peas,0.0,Not Normal
6,horticulture plants,0.0,Not Normal
7,ley for hay and forage plants,0.0,Not Normal
8,ley for seeds,0.0,Not Normal
9,mixed grain,0.0,Not Normal


And finally, let's see if Mr. Anderson (the Anderson-Darling test) agrees

In [60]:
# Anderson-Darling normality test on every column of `df`.
# The test reports a statistic plus critical values rather than a p-value, so a
# column is labelled "Normal" when its statistic is below the critical value
# for the 5% significance level, and "Not Normal" otherwise.
normal = []
not_normal = []
for feature in df.columns:
    data = df[feature].dropna()
    result = anderson(data)

    # `significance_level` is expressed in percent, so 5.0 selects the 5% entry;
    # its position indexes the matching critical value.
    sig_level_index = result.significance_level.tolist().index(5.0)
    crit_val = result.critical_values[sig_level_index]

    if result.statistic < crit_val:
        normal.append((feature, result.statistic))
    else:
        not_normal.append((feature, result.statistic))

normal_df = pd.DataFrame(normal, columns=['Feature', 'Statistic'])
normal_df['Normality'] = 'Normal'

not_normal_df = pd.DataFrame(not_normal, columns=['Feature', 'Statistic'])
not_normal_df['Normality'] = 'Not Normal'

# Build the combined table straight from the labelled rows instead of
# pd.concat([normal_df, not_normal_df]): when one of the two groups is empty,
# concatenating an empty frame emits a FutureWarning in recent pandas.
# Row order is unchanged: all "Normal" rows first, then all "Not Normal" rows.
labelled_rows = (
    [(col, statistic, 'Normal') for col, statistic in normal]
    + [(col, statistic, 'Not Normal') for col, statistic in not_normal]
)
result_df = pd.DataFrame(labelled_rows, columns=['Feature', 'Statistic', 'Normality'])
result_df.sort_values(by='Feature').reset_index(drop=True)

  result_df = pd.concat([normal_df, not_normal_df], ignore_index=True)


Unnamed: 0,Feature,Statistic,Normality
0,barley,752.701899,Not Normal
1,energy forest,1158.663589,Not Normal
2,fallow land,516.048673,Not Normal
3,"field peas for cooking, fodder peas, vetches a...",721.554819,Not Normal
4,green fodder,420.589592,Not Normal
5,green peas,1431.113924,Not Normal
6,horticulture plants,1164.656658,Not Normal
7,ley for hay and forage plants,965.371542,Not Normal
8,ley for seeds,732.508311,Not Normal
9,mixed grain,606.736967,Not Normal


## Feature Correlation

Spearman rank correlation

In [61]:
# Pairwise Spearman rank correlation across all columns of `df`;
# the matrix is kept in `spearman` and shown as the cell output.
spearman = df.corr(method="spearman")
spearman

Unnamed: 0,year,barley,energy forest,fallow land,"field peas for cooking, fodder peas, vetches and field beans",green fodder,green peas,horticulture plants,ley for hay and forage plants,ley for seeds,...,triticale,unspecified arable land,utilized ley for hay,utilized ley for hay and pasture,utilized pasture,white beans,winter barley,winter rape,winter turnip rape,winter wheat
year,1.0,-0.790997,0.48108,0.24166,-0.071282,-0.082596,0.228504,0.645841,0.062942,-0.212417,...,0.570369,0.713141,-0.024782,-0.018526,0.061068,0.057402,0.407175,-0.060755,-0.216498,0.020371
barley,-0.790997,1.0,-0.34935,0.031347,0.286486,0.4456,-0.163749,-0.466578,-0.397576,0.321267,...,-0.584031,-0.462947,0.030515,0.427148,-0.395622,-0.042792,-0.535922,0.208117,0.322782,0.170826
energy forest,0.48108,-0.34935,1.0,0.468512,0.369349,0.186682,0.42884,0.677004,-0.224848,0.174076,...,0.478212,0.489971,-0.009868,0.280993,-0.225175,-0.002059,0.335208,0.181883,0.054007,0.344368
fallow land,0.24166,0.031347,0.468512,1.0,0.668699,0.360319,0.235499,0.415639,-0.052641,0.53999,...,0.440207,0.27332,0.010965,0.471998,-0.047851,0.076034,0.298359,0.408417,0.401223,0.627421
"field peas for cooking, fodder peas, vetches and field beans",-0.071282,0.286486,0.369349,0.668699,1.0,0.253875,0.220362,0.221237,-0.110606,0.616398,...,0.331394,0.040496,0.019375,0.362046,-0.095171,0.071123,0.242301,0.675159,0.558812,0.861439
green fodder,-0.082596,0.4456,0.186682,0.360319,0.253875,1.0,0.061161,0.276429,-0.62711,0.176933,...,-0.079519,0.272588,0.011322,0.893407,-0.625966,0.063216,-0.206784,0.123884,0.085243,0.094017
green peas,0.228504,-0.163749,0.42884,0.235499,0.220362,0.061161,1.0,0.394712,-0.105392,0.207548,...,0.262215,0.209773,-0.004625,0.061322,-0.105545,-0.011344,0.245503,0.281075,-0.019541,0.265398
horticulture plants,0.645841,-0.466578,0.677004,0.415639,0.221237,0.276429,0.394712,1.0,-0.300298,-0.000169,...,0.534069,0.672059,-0.013179,0.35849,-0.300735,0.111455,0.35779,0.09907,-0.076319,0.206404
ley for hay and forage plants,0.062942,-0.397576,-0.224848,-0.052641,-0.110606,-0.62711,-0.105392,-0.300298,1.0,0.092163,...,0.286105,-0.297961,-0.01123,-0.67021,0.995735,-0.027542,0.458539,0.049301,0.013486,0.033253
ley for seeds,-0.212417,0.321267,0.174076,0.53999,0.616398,0.176933,0.207548,-0.000169,0.092163,1.0,...,0.139302,-0.151699,0.007473,0.213279,0.099073,-0.013387,0.1761,0.50065,0.434218,0.594848


Extract and display the most strongly correlated feature pairs

In [62]:
# Rank feature pairs by the magnitude of their Spearman correlation.
# Absolute values are used so strong negative relationships rank alongside
# strong positive ones. Note that this rebinds `spearman` to the unsigned matrix.
spearman = spearman.abs()

# Zero the self-correlations on the diagonal. This is done with a boolean
# identity mask instead of np.fill_diagonal(spearman.values, 0): the array
# returned by `.values` is read-only under pandas Copy-on-Write (the write
# raises ValueError) and is a throwaway copy in other cases (the write is lost).
spearman = spearman.mask(np.eye(len(spearman), dtype=bool), 0)

# Long format: one row per ordered (Feature 1, Feature 2) pair.
correlations = spearman.unstack().reset_index()
correlations.columns = ['Feature 1', 'Feature 2', 'Correlation']
# The matrix is symmetric, so keep only one orientation of each pair; the
# strict string comparison also removes the self-pairs.
correlations = correlations[correlations['Feature 1'] < correlations['Feature 2']]

# Keep pairs with 0.4 < |rho| < 1.0 (the upper bound also excludes any pair
# whose correlation is exactly 1.0), strongest first.
high_correlation = correlations[(correlations['Correlation'] > 0.4) & (correlations['Correlation'] < 1.0)]
high_correlation = high_correlation.sort_values(by='Correlation', ascending=False).reset_index(drop=True)
high_correlation.head(10)

Unnamed: 0,Feature 1,Feature 2,Correlation
0,ley for hay and forage plants,utilized pasture,0.995735
1,green fodder,utilized ley for hay and pasture,0.893407
2,spring wheat,winter wheat,0.891366
3,rye,winter wheat,0.873932
4,"field peas for cooking, fodder peas, vetches a...",winter wheat,0.861439
5,spring rape,winter wheat,0.851253
6,rye,spring wheat,0.840786
7,"field peas for cooking, fodder peas, vetches a...",spring wheat,0.835334
8,spring rape,spring wheat,0.834856
9,oats,total arable land,0.824736
