In [3]:
from sklearn.datasets import make_blobs, make_moons, make_regression, make_classification
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

from scipy.stats import pearsonr, spearmanr

from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score, explained_variance_score

# from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import f1_score, roc_curve, roc_auc_score



# clustering
# from sklearn.datasets import make_blobs, make_moons
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.cluster import KMeans

# from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score

# from scipy.stats import pearsonr, spearmanr

# data exploration
# from sklearn.datasets import make_regression, make_moons
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.neural_network import MLPRegressor
# from sklearn.ensemble import RandomForestRegressor

# from sklearn.metrics import r2_score, explained_variance_score

# from scipy.stats import pearsonr, spearmanr

# classification
# from sklearn.datasets import make_classification, make_moons
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LogisticRegression
# from sklearn.neural_network import MLPClassifier
# from sklearn.ensemble import RandomForestClassifier

# from sklearn.metrics import f1_score, roc_curve, roc_auc_score

# from scipy.stats import pearsonr, spearmanr

# regression

# from sklearn.datasets import make_regression, make_moons
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pandas as pd
# import numpy as np

# from sklearn.model_selection import train_test_split
# from sklearn.linear_model import LinearRegression
# from sklearn.neural_network import MLPRegressor
# from sklearn.ensemble import RandomForestRegressor

# from sklearn.metrics import r2_score, explained_variance_score

# from scipy.stats import pearsonr, spearmanr

In [4]:
def interval_transform(x, a, b):
    """
    Linearly rescale ``x`` so that its extremes land on ``a`` and ``b``.

    The mapping is ``alpha * x + beta`` with
    ``alpha = (a - b) / (x.max() - x.min())`` and ``beta = b - alpha * x.min()``.
    Consequently ``x.min()`` is sent to ``b`` and ``x.max()`` is sent to ``a``.

    NOTE(review): with ``a < b`` the output ordering is therefore reversed
    relative to the input (the largest input becomes ``a``). This orientation
    is what the function has always produced and is kept unchanged so that
    previously generated data stays the same -- confirm it is intended.

    Parameters
    ----------
    x : array-like exposing ``min()``, ``max()`` and arithmetic
        (for example a NumPy array or a pandas Series).
    a : number
        Value assigned to the largest entry of ``x``.
    b : number
        Value assigned to the smallest entry of ``x``.

    Returns
    -------
    The rescaled values, ``alpha * x + beta``.

    A constant ``x`` has a zero span, so the slope is a division by zero and
    the result is not meaningful.
    """
    lowest = x.min()
    highest = x.max()

    # Build the slope from the span directly. The previous form,
    # (1 - min/max) * max / (a - b), divided by the maximum and turned the
    # whole result into NaN whenever the maximum was exactly 0.
    alpha = (a - b) / (highest - lowest)
    beta = b - alpha * lowest

    return alpha * x + beta

def make_noise_feature(x, random_state=None):
    """
    Build a noisy synthetic column that depends only weakly on ``x``.

    Each row of ``x`` is reduced to a weighted sum of its features, with one
    weight per feature drawn uniformly from ``[1e-4, 1e-2)``, and Gaussian
    noise with mean 1 and standard deviation 5 is added per row.

    Parameters
    ----------
    x : 2-D array of shape ``(n_samples, n_features)``.
    random_state : None, int or numpy.random.RandomState, optional
        ``None`` (default) draws from NumPy's global random state, exactly as
        before, so ``np.random.seed`` still controls the result. An int seeds
        a private generator; a ``RandomState`` instance is used as given.
        Either makes the call reproducible without touching global state.

    Returns
    -------
    1-D array of length ``n_samples``.
    """
    n_samples, n_features = x.shape[0], x.shape[1]

    if random_state is None:
        # The np.random module exposes the same uniform/normal functions as a
        # RandomState instance, backed by the global state.
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Draw order (weights first, then noise) matters for reproducibility.
    weights = rng.uniform(1e-4, 1e-2, n_features)
    noise = rng.normal(1, 5, n_samples)

    signal = np.sum(weights * x, -1)
    return signal + noise

def calculate_pvalues(df,
                      method = spearmanr
                     ):
    """
    Pairwise p-value table for every pair of columns in ``df``.

    Assumes df with only numeric entries clean of null entries.

    Parameters
    ----------
    df : pandas.DataFrame
    method : callable
        Called as ``method(df[r], df[c])``; element ``[1]`` of its result is
        taken as the p-value (this is how ``scipy.stats.pearsonr`` and
        ``scipy.stats.spearmanr`` report it).

    Returns
    -------
    pandas.DataFrame of floats whose index and columns are both
    ``df.columns``. The entry in row ``c``, column ``r`` is
    ``method(df[r], df[c])[1]`` rounded to 4 decimals.
    """
    labels = df.columns
    size = len(labels)

    # Fill a plain float array by position instead of assigning through
    # ``pvalues[r][c] = ...``: that chained assignment writes to a temporary
    # under pandas Copy-on-Write and leaves the table full of NaN, and the
    # previous object-dtype table made later numeric comparisons fragile.
    values = np.empty((size, size), dtype=float)
    for col_pos, r in enumerate(labels):
        for row_pos, c in enumerate(labels):
            values[row_pos, col_pos] = round(method(df[r], df[c])[1], 4)

    return pd.DataFrame(values, index=labels, columns=labels)

def correlation_matrix(df,
                       method = "pearson",
                       annot_bool = False,
                       annot_size = 20
                      ):
    """
    Draw the lower triangle of the correlation matrix of ``df`` as a heatmap.

    Parameters
    ----------
    df : pandas.DataFrame
        When ``annot_bool`` is True the frame is also passed to
        ``calculate_pvalues``, which assumes numeric, null-free columns.
    method : str
        Forwarded to ``DataFrame.corr``. Significance uses ``pearsonr`` when
        this is ``"pearson"`` and ``spearmanr`` for any other value.
    annot_bool : bool
        If True, write the coefficient (2 decimals) in each cell followed by
        significance stars: ``*`` for p <= 0.1, ``**`` for p <= 0.05,
        ``***`` for p <= 0.01, and no star otherwise.
    annot_size : int
        Font size of the annotations.

    Returns
    -------
    None. A new 11x11 inch figure is created and drawn on.
    """
    # Compute the correlation matrix
    corr = df.corr(method = method)

    # seaborn reads annot=False as "no annotations". Defining it up front
    # keeps the heatmap call valid when annot_bool is False (it used to raise
    # NameError because ``annot`` was only assigned inside the branch below).
    annot = False

    if annot_bool:
        sig_meth = pearsonr if method == "pearson" else spearmanr

        # Force a float table so the threshold comparisons are numeric.
        pval = calculate_pvalues(df, sig_meth).astype(float)

        def labelled(suffix):
            # Coefficients formatted to two decimals plus the given suffix.
            # Column-wise Series.map avoids the deprecated DataFrame.applymap.
            template = '{:.2f}' + suffix
            return corr.apply(lambda column: column.map(template.format))

        # Start with no stars and upgrade from the loosest to the strictest
        # threshold, so the strictest threshold a cell satisfies wins. The
        # earlier where/mask chain put '**' on every p > 0.05 and left
        # 0.01 < p <= 0.05 without any star.
        annot = labelled('')
        for threshold, stars in ((0.1, '*'), (0.05, '**'), (0.01, '***')):
            annot = annot.mask(pval <= threshold, labelled(stars))

    # Generate a mask for the upper triangle (diagonal included).
    # Plain ``bool``: the ``np.bool`` alias no longer exists in NumPy.
    mask = np.zeros_like(corr, dtype=bool)
    mask[np.triu_indices_from(mask)] = True

    # Set up the matplotlib figure
    f, ax = plt.subplots(figsize=(11, 11))

    # Generate a custom diverging colormap
    cmap = sns.diverging_palette(220, 10, as_cmap=True)

    # Draw the heatmap with the mask and correct aspect ratio
    sns.heatmap(corr, mask=mask, cmap=cmap, center=0,
                square=True, linewidths=.5, cbar_kws={"shrink": .5},
                annot = annot,
                fmt = "",
                annot_kws={"size": annot_size},
                vmin = -1,
                vmax = 1,
                ax = ax,
               )

In [5]:
# Generation settings. Only n_samples is used by this cell.
n_samples = 2000
n_info = 3
n_redu = 0

# Two separate two-centre blob draws. Stacking several blob-like groups is
# what produces a non-linear boundary between the labels; cluster_std sets
# how much the groups overlap (the difficulty).
X1, y1 = make_blobs(
    n_samples=n_samples,
    n_features=2,
    centers=np.array([[42, 39], [39.5, 38.3]]),
    cluster_std=1.4,
    shuffle=False,
    random_state=42,
)
X2, y2 = make_blobs(
    n_samples=n_samples,
    n_features=2,
    centers=np.array([[44, 39.8], [38, 37.9]]),
    cluster_std=1.2,
    shuffle=False,
    random_state=6,
)

# Noisy two-moons sample; not used by the rest of this cell.
X3, y3 = make_moons(n_samples=2 * n_samples, noise=1, random_state=42)

# Rows of the first draw followed by rows of the second.
X = np.vstack((X1, X2))
y = np.hstack((y1, y2))

# Feature columns 0 and 1, label in column 2.
data = pd.DataFrame(np.column_stack((X, y)))

In [6]:
# Sanity check: X is X1 and X2 concatenated along the row axis.
X.shape

(4000, 2)

In [8]:
# Rescale the two feature columns of `data` onto the bounds 1..4 and 0..100.
# interval_transform sends a column's minimum to `b` and its maximum to `a`.
gpa_column = interval_transform(data[0], a=1, b=4)
passed_column = interval_transform(data[1], a=0, b=100)


In [9]:
# Column names of the exported table, in the order the arrays are stacked.
columns = ["cGPA", "passed_percent", "degree"]

# Each 1-D input becomes one column of a single 2-D array.
full_data = np.column_stack((gpa_column, passed_column, y))

df_full = pd.DataFrame(full_data, columns=columns)

In [17]:
# Path is relative to the kernel's working directory (the data-creation
# folder, per the recorded `pwd` output) rather than an absolute /home/...
# path, so the notebook is not tied to one machine. It resolves to the same
# file as before when run from that folder.
df_full.to_csv('../clustering/data/clustering_data.csv', index=False)

In [16]:
pwd

'/home/john/research/tutorials/data-creation'

In [18]:
# Generation settings; n_redu is not used by this cell.
n_samples = 2000
n_info = 3
n_redu = 0

# Two regression problems that differ in seed, effective rank, noise level,
# tail strength and bias. Their rows are stacked below.
X1, y1 = make_regression(
    n_samples=n_samples,
    n_features=3,
    n_informative=n_info,
    effective_rank=2,
    tail_strength=0.2,
    noise=0.6,
    bias=12,
    shuffle=False,
    random_state=42,
)
X2, y2 = make_regression(
    n_samples=n_samples,
    n_features=3,
    n_informative=n_info,
    effective_rank=1,
    tail_strength=0.3,
    noise=1.1,
    bias=10,
    shuffle=False,
    random_state=6,
)

# Rows of the first problem followed by rows of the second.
X = np.vstack((X1, X2))
y = np.hstack((y1, y2))

# Feature columns 0-2, target in column 3.
data = pd.DataFrame(np.column_stack((X, y)))

In [20]:
# make_noise_feature draws from NumPy's global random state. Seed it here so
# the sex, hsGPA and ethnicity columns are the same on every run; without a
# seed the exported table changed each time the cell was executed.
np.random.seed(42)

# Rescaled copies of the three regression features.
attendance_column = interval_transform(data[2], 0, 100)
gpa_column = interval_transform(data[1], 1, 4)
passed_column = interval_transform(data[0], 0, 100)

# Binary column: 1 where the noise feature exceeds its own mean, else 0.
sex_column = make_noise_feature(X)
sex_column = (sex_column > sex_column.mean()).astype(int)

hsgpa_column = interval_transform(make_noise_feature(X), 0, 4)

# Three bins labelled 0/1/2, cut at the 25th and 50th percentiles.
ethn_column = make_noise_feature(X)
ethn_column = pd.qcut(ethn_column, q=[0, .25, .5, 1], labels=[0, 1, 2])

# Regression target rescaled onto the bounds 0..30.
fci_post = interval_transform(y, 0, 30)


In [21]:
# Column names of the exported table, in the order the arrays are stacked.
columns = [
    "cGPA",
    "attendance",
    "passed_percent",
    "sex",
    "hsGPA",
    "ethnicity",
    "fci_post",
]

# Each 1-D input becomes one column of a single 2-D array.
full_data = np.column_stack((
    gpa_column,
    attendance_column,
    passed_column,
    sex_column,
    hsgpa_column,
    ethn_column,
    fci_post,
))

df_full = pd.DataFrame(full_data, columns=columns)

In [24]:
# Path is relative to the kernel's working directory (the data-creation
# folder, per the recorded `pwd` output) rather than an absolute /home/...
# path, so the notebook is not tied to one machine. It resolves to the same
# file as before when run from that folder.
df_full.to_csv('../exploring-data/data/regression_data.csv', index=False)

In [25]:
# Generation settings; n_redu is not used by this cell.
n_samples = 2000
n_info = 3
n_redu = 0

# Two regression problems that differ in seed, effective rank, noise level,
# tail strength and bias. Their rows are stacked below.
X1, y1 = make_regression(
    n_samples=n_samples,
    n_features=3,
    n_informative=n_info,
    effective_rank=2,
    tail_strength=0.2,
    noise=0.6,
    bias=12,
    shuffle=False,
    random_state=42,
)
X2, y2 = make_regression(
    n_samples=n_samples,
    n_features=3,
    n_informative=n_info,
    effective_rank=1,
    tail_strength=0.3,
    noise=1.1,
    bias=10,
    shuffle=False,
    random_state=6,
)

# Rows of the first problem followed by rows of the second.
X = np.vstack((X1, X2))
y = np.hstack((y1, y2))

# Feature columns 0-2, target in column 3.
data = pd.DataFrame(np.column_stack((X, y)))

In [26]:
# make_noise_feature draws from NumPy's global random state. Seed it here so
# the sex, hsGPA and ethnicity columns are the same on every run; without a
# seed the exported table changed each time the cell was executed.
np.random.seed(42)

# Rescaled copies of the three regression features.
attendance_column = interval_transform(data[2], 0, 100)
gpa_column = interval_transform(data[1], 1, 4)
passed_column = interval_transform(data[0], 0, 100)

# Binary column: 1 where the noise feature exceeds its own mean, else 0.
sex_column = make_noise_feature(X)
sex_column = (sex_column > sex_column.mean()).astype(int)

hsgpa_column = interval_transform(make_noise_feature(X), 0, 4)

# Three bins labelled 0/1/2, cut at the 25th and 50th percentiles.
ethn_column = make_noise_feature(X)
ethn_column = pd.qcut(ethn_column, q=[0, .25, .5, 1], labels=[0, 1, 2])

# Regression target rescaled onto the bounds 0..30.
fci_post = interval_transform(y, 0, 30)


In [27]:
# Column names of the exported table, in the order the arrays are stacked.
columns = [
    "cGPA",
    "attendance",
    "passed_percent",
    "sex",
    "hsGPA",
    "ethnicity",
    "fci_post",
]

# Each 1-D input becomes one column of a single 2-D array.
full_data = np.column_stack((
    gpa_column,
    attendance_column,
    passed_column,
    sex_column,
    hsgpa_column,
    ethn_column,
    fci_post,
))

df_full = pd.DataFrame(full_data, columns=columns)

In [28]:
# Path is relative to the kernel's working directory (the data-creation
# folder, per the recorded `pwd` output) rather than an absolute /home/...
# path, so the notebook is not tied to one machine. It resolves to the same
# file as before when run from that folder.
df_full.to_csv('../regression/data/regression_data.csv', index=False)