In [4]:
import django
import os
# Point Django at the project's settings module unless the variable is already set.
os.environ.setdefault("DJANGO_SETTINGS_MODULE", "kML.settings")
# NOTE(review): presumably needed because the notebook kernel runs the ORM calls
# below from inside an event loop - confirm against Django's async-safety docs.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"
# Must run after the environment variables above are set and before any model import.
django.setup()

In [5]:
from regml.models import RegData, ColumnTypes, DataOutput, FileMetaData, Dropdown


In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_regression
from pandas.api.types import is_numeric_dtype
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as opy
import plotly.express as px
import math
import networkx as nx
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

In [7]:
# Project name used as the project_name filter in every ORM query of this notebook.
title = 'boston project'

In [8]:
# Position -> observations payload; filled by the "Retrieve observations" cell.
data_dict = dict()
# Names of the feature columns and of the target column(s), as two independent lists.
x_cols, y_cols = [], []
# Column names bucketed by type key. Later cells treat 'n' as numeric candidates,
# 'c' as categorical candidates and 'int' as the columns usable in numeric plots;
# 'd' is only initialised here (presumably dates - TODO confirm).
col_types = {bucket: [] for bucket in ('n', 'c', 'd', 'int')}

In [9]:
# Retrieve columns
# Every column goes into its type bucket; the `y` flag only decides whether its
# name is recorded as a target or as a feature.
for col_meta in ColumnTypes.objects.filter(project_name=title):
    role_cols = y_cols if col_meta.y else x_cols
    role_cols.append(col_meta.col_name)
    col_types[col_meta.col_type].append(col_meta.col_name)

In [10]:
# Retrieve observations
# Key each stored observations payload by its position in the queryset.
project_rows = RegData.objects.filter(project_name=title)
data_dict.update(enumerate(obs_row.observations for obs_row in project_rows))

In [11]:
# Build DF
# One DataFrame row per stored observation; the dict keys become the index.
df = pd.DataFrame.from_dict(data_dict, orient='index')

# Iterate over a snapshot of the list: the loop body removes entries from
# col_types['n'], and removing from a list while iterating it makes the loop
# silently skip the element that follows each removed one.
for col in list(col_types['n']):
    if not is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            # Cannot be parsed as numbers: reclassify the column as categorical.
            col_types['n'].remove(col)
            col_types['c'].append(col)

# Categorical columns that parse as numbers are also usable in the numeric plots.
for col in col_types['c']:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        # Genuinely non-numeric category: leave the column as it is.
        continue
    # Membership check keeps the list free of duplicates when the cell is re-run.
    if col not in col_types['int']:
        col_types['int'].append(col)

# Same guard for the numeric columns, so a re-run does not append them twice.
col_types['int'].extend(col for col in col_types['n'] if col not in col_types['int'])

In [13]:
# Resolve the project once instead of issuing one lookup per column.
project = FileMetaData.objects.get(project_name=title)
# Remove this project's earlier entries first: without this, every re-run of the
# cell stacks another copy of each column name in the Dropdown table.
Dropdown.objects.filter(project_name=title).delete()
# dict.fromkeys drops repeated names while keeping their original order.
for c in dict.fromkeys(col_types['int']):
    Dropdown(col_name=c, project_name=project).save()

In [20]:
# Column names stored for this project, in queryset order.
# NOTE(review): the name `vars` shadows the Python builtin of the same name.
vars = list(Dropdown.objects.filter(project_name=title).values_list('col_name', flat=True))
# Displayed as the cell output: column names across all projects (no project filter).
Dropdown.objects.values_list('col_name', flat=True)


<QuerySet ['b', 'rm', 'zn', 'age', 'dis', 'nox', 'rad', 'tax', 'chas', 'crim', 'medv', 'indus', 'lstat', 'ptratio', 'b', 'rm', 'zn', 'age', 'dis', 'nox', '...(remaining elements truncated)...']>

In [None]:
# Save Corr Matrix
# Pairwise correlations of the numeric columns; reset_index exposes the row
# labels as an 'index' column, which the melt below uses as identifier.
corr = df[col_types['int']].corr().reset_index()

# Drop a previously stored matrix for this project before writing the new one.
stored_matrix = DataOutput.objects.filter(project_name=title, output_name='corr_matrix')
existing = stored_matrix.exists()
if existing:
    stored_matrix.delete()

# Long format (one record per pair of columns) for the JavaScript plot.
corr_records = pd.melt(corr, id_vars='index').to_dict(orient='records')
owning_project = FileMetaData.objects.get(project_name=title)
DataOutput(output=corr_records, output_name='corr_matrix', project_name=owning_project).save()

In [None]:
# Heatmap of the correlation matrix; the 'index' column is restored as row labels first.
corr_matrix = corr.set_index('index')
fig = px.imshow(
    corr_matrix,
    color_continuous_scale=[(0, "#ff9900"), (0.5, 'white'), (1, "#2D3949")],
)
# All layout settings applied in a single call.
fig.update_layout(
    showlegend=False,
    title_text="Feature Correlation Matrix",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(title=None)

fig.show()

In [None]:
#var comes from col_types['int']
var = 'zn'
target = y_cols[0]
# Standardise both columns (subtract the mean, divide by the standard deviation)
# so feature and target share one scale on the plot.
extract_df = df[[var, target]]
normalized_df = (extract_df - extract_df.mean()) / extract_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 1000:
    normalized_df = normalized_df.sample(1000, random_state=0)
fig = px.scatter(normalized_df, x=var, y=target)
fig.update_layout(
    showlegend=False,
    title_text=f"Scatter plot of {var} and the target {target}",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
# Small markers with a dark outline.
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='#2D3949')),
                  selector=dict(mode='markers'))
fig.show()

In [None]:
# all plots
# Standardise every numeric column (subtract the mean, divide by the standard
# deviation) so all subplots share a comparable scale.
numeric_df = df[col_types['int']]
normalized_df = (numeric_df - numeric_df.mean()) / numeric_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 500:
    normalized_df = normalized_df.sample(500, random_state=0)
# The target goes on the x axis of every subplot; each other column gets its own panel.
x = y_cols[0]
temp_cols = list(normalized_df.columns)
temp_cols.remove(x)

fig = make_subplots(rows=math.ceil(len(temp_cols) / 2), cols=2, start_cell="bottom-left",
                    subplot_titles=tuple(temp_cols))
for i, feature in enumerate(temp_cols):
    # Fill the two-column grid left to right, one grid row per pair (1-based coordinates).
    rows, cols = i // 2 + 1, i % 2 + 1
    fig.add_trace(go.Scatter(x=normalized_df[x], y=normalized_df[feature], mode='markers'),
                  row=rows, col=cols)
fig.update_layout(showlegend=False, title_text=f"Linear Relationship of {x} (x axis) and features (y axis)",
                  template="ygridoff")
fig.show()

In [None]:
# f-scores
# Replace each categorical column by integer codes so f_regression can consume it.
label_encoder = LabelEncoder()
for col in col_types['c']:
    df[col] = label_encoder.fit_transform(df[col])
target = y_cols[0]
# Candidate features: numeric plus categorical columns, minus the target.
# Filtering (instead of list.remove) does not raise when the target is in neither list.
new_l = [col for col in col_types['n'] + col_types['c'] if col != target]
X = df[new_l]
y = df[target]
# f_regression returns a pair (F-statistics, p-values); index 1 selects the p-values.
f_scores = f_regression(X, y, center=True)
p_values = pd.Series(f_scores[1], index=X.columns).sort_values(ascending=False)

fig = go.Figure([go.Bar(x=p_values.index, y=p_values.values)])
fig.update_traces(marker_color="#ff9900", marker_line_color='#2D3949',
                  marker_line_width=1.5, opacity=0.8)
# The bars are p-values, so the title says so (it previously read "F-scores").
fig.update_layout(
    showlegend=False,
    title_text="F-test p-values - Categorical and Numeric Features",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Display the column names collected under the 'int' key.
col_types['int']

In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_regression
from pandas.api.types import is_numeric_dtype
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as opy
import plotly.express as px
import math
import networkx as nx
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

In [None]:
# Project name used as the project_name filter in every ORM query of this notebook.
title = 'boston project'

In [None]:
# Position -> observations payload; filled by the "Retrieve observations" cell.
data_dict = dict()
# Names of the feature columns and of the target column(s), as two independent lists.
x_cols, y_cols = [], []
# Column names bucketed by type key. Later cells treat 'n' as numeric candidates,
# 'c' as categorical candidates and 'int' as the columns usable in numeric plots;
# 'd' is only initialised here (presumably dates - TODO confirm).
col_types = {bucket: [] for bucket in ('n', 'c', 'd', 'int')}

In [None]:
# Retrieve columns
# Every column goes into its type bucket; the `y` flag only decides whether its
# name is recorded as a target or as a feature.
for col_meta in ColumnTypes.objects.filter(project_name=title):
    role_cols = y_cols if col_meta.y else x_cols
    role_cols.append(col_meta.col_name)
    col_types[col_meta.col_type].append(col_meta.col_name)

In [None]:
# Retrieve observations
# Key each stored observations payload by its position in the queryset.
project_rows = RegData.objects.filter(project_name=title)
data_dict.update(enumerate(obs_row.observations for obs_row in project_rows))

In [None]:
# Build DF
# One DataFrame row per stored observation; the dict keys become the index.
df = pd.DataFrame.from_dict(data_dict, orient='index')

# Iterate over a snapshot of the list: the loop body removes entries from
# col_types['n'], and removing from a list while iterating it makes the loop
# silently skip the element that follows each removed one.
for col in list(col_types['n']):
    if not is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            # Cannot be parsed as numbers: reclassify the column as categorical.
            col_types['n'].remove(col)
            col_types['c'].append(col)

# Categorical columns that parse as numbers are also usable in the numeric plots.
for col in col_types['c']:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        # Genuinely non-numeric category: leave the column as it is.
        continue
    # Membership check keeps the list free of duplicates when the cell is re-run.
    if col not in col_types['int']:
        col_types['int'].append(col)

# Same guard for the numeric columns, so a re-run does not append them twice.
col_types['int'].extend(col for col in col_types['n'] if col not in col_types['int'])

In [None]:
# Preview the first rows of the assembled DataFrame.
df.head()

In [None]:
# Save Corr Matrix
# Pairwise correlations of the numeric columns; reset_index exposes the row
# labels as an 'index' column, which the melt below uses as identifier.
corr = df[col_types['int']].corr().reset_index()

# Drop a previously stored matrix for this project before writing the new one.
stored_matrix = DataOutput.objects.filter(project_name=title, output_name='corr_matrix')
existing = stored_matrix.exists()
if existing:
    stored_matrix.delete()

# Long format (one record per pair of columns) for the JavaScript plot.
corr_records = pd.melt(corr, id_vars='index').to_dict(orient='records')
owning_project = FileMetaData.objects.get(project_name=title)
DataOutput(output=corr_records, output_name='corr_matrix', project_name=owning_project).save()

In [None]:
# Heatmap of the correlation matrix; the 'index' column is restored as row labels first.
corr_matrix = corr.set_index('index')
fig = px.imshow(
    corr_matrix,
    color_continuous_scale=[(0, "#ff9900"), (0.5, 'white'), (1, "#2D3949")],
)
# All layout settings applied in a single call.
fig.update_layout(
    showlegend=False,
    title_text="Feature Correlation Matrix",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(title=None)

fig.show()

In [None]:
#var comes from col_types['int']
var = 'zn'
target = y_cols[0]
# Standardise both columns (subtract the mean, divide by the standard deviation)
# so feature and target share one scale on the plot.
extract_df = df[[var, target]]
normalized_df = (extract_df - extract_df.mean()) / extract_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 1000:
    normalized_df = normalized_df.sample(1000, random_state=0)
fig = px.scatter(normalized_df, x=var, y=target)
fig.update_layout(
    showlegend=False,
    title_text=f"Scatter plot of {var} and the target {target}",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
# Small markers with a dark outline.
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='#2D3949')),
                  selector=dict(mode='markers'))
fig.show()

In [None]:
# all plots
# Standardise every numeric column (subtract the mean, divide by the standard
# deviation) so all subplots share a comparable scale.
numeric_df = df[col_types['int']]
normalized_df = (numeric_df - numeric_df.mean()) / numeric_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 500:
    normalized_df = normalized_df.sample(500, random_state=0)
# The target goes on the x axis of every subplot; each other column gets its own panel.
x = y_cols[0]
temp_cols = list(normalized_df.columns)
temp_cols.remove(x)

fig = make_subplots(rows=math.ceil(len(temp_cols) / 2), cols=2, start_cell="bottom-left",
                    subplot_titles=tuple(temp_cols))
for i, feature in enumerate(temp_cols):
    # Fill the two-column grid left to right, one grid row per pair (1-based coordinates).
    rows, cols = i // 2 + 1, i % 2 + 1
    fig.add_trace(go.Scatter(x=normalized_df[x], y=normalized_df[feature], mode='markers'),
                  row=rows, col=cols)
fig.update_layout(showlegend=False, title_text=f"Linear Relationship of {x} (x axis) and features (y axis)",
                  template="ygridoff")
fig.show()

In [None]:
# f-scores
# Replace each categorical column by integer codes so f_regression can consume it.
label_encoder = LabelEncoder()
for col in col_types['c']:
    df[col] = label_encoder.fit_transform(df[col])
target = y_cols[0]
# Candidate features: numeric plus categorical columns, minus the target.
# Filtering (instead of list.remove) does not raise when the target is in neither list.
new_l = [col for col in col_types['n'] + col_types['c'] if col != target]
X = df[new_l]
y = df[target]
# f_regression returns a pair (F-statistics, p-values); index 1 selects the p-values.
f_scores = f_regression(X, y, center=True)
p_values = pd.Series(f_scores[1], index=X.columns).sort_values(ascending=False)

fig = go.Figure([go.Bar(x=p_values.index, y=p_values.values)])
fig.update_traces(marker_color="#ff9900", marker_line_color='#2D3949',
                  marker_line_width=1.5, opacity=0.8)
# The bars are p-values, so the title says so (it previously read "F-scores").
fig.update_layout(
    showlegend=False,
    title_text="F-test p-values - Categorical and Numeric Features",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.show()

In [None]:
# Display the column names collected under the 'int' key.
col_types['int']

ImportError: cannot import name 'Dropdown'

In [6]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import chi2, f_regression
from pandas.api.types import is_numeric_dtype
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.offline as opy
import plotly.express as px
import math
import networkx as nx
from sklearn.linear_model import LinearRegression, RidgeCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, Pipeline

In [7]:
# Project name used as the project_name filter in every ORM query of this notebook.
title = 'boston project'

In [8]:
# Position -> observations payload; filled by the "Retrieve observations" cell.
data_dict = dict()
# Names of the feature columns and of the target column(s), as two independent lists.
x_cols, y_cols = [], []
# Column names bucketed by type key. Later cells treat 'n' as numeric candidates,
# 'c' as categorical candidates and 'int' as the columns usable in numeric plots;
# 'd' is only initialised here (presumably dates - TODO confirm).
col_types = {bucket: [] for bucket in ('n', 'c', 'd', 'int')}

In [14]:
# Retrieve columns
# Every column goes into its type bucket; the `y` flag only decides whether its
# name is recorded as a target or as a feature.
for col_meta in ColumnTypes.objects.filter(project_name=title):
    role_cols = y_cols if col_meta.y else x_cols
    role_cols.append(col_meta.col_name)
    col_types[col_meta.col_type].append(col_meta.col_name)

AttributeError: 'RegData' object has no attribute 'col_name'

In [16]:
# Retrieve observations
# Key each stored observations payload by its position in the queryset.
project_rows = RegData.objects.filter(project_name=title)
data_dict.update(enumerate(obs_row.observations for obs_row in project_rows))

In [17]:
# Build DF
# One DataFrame row per stored observation; the dict keys become the index.
df = pd.DataFrame.from_dict(data_dict, orient='index')

# Iterate over a snapshot of the list: the loop body removes entries from
# col_types['n'], and removing from a list while iterating it makes the loop
# silently skip the element that follows each removed one.
for col in list(col_types['n']):
    if not is_numeric_dtype(df[col]):
        try:
            df[col] = pd.to_numeric(df[col])
        except ValueError:
            # Cannot be parsed as numbers: reclassify the column as categorical.
            col_types['n'].remove(col)
            col_types['c'].append(col)

# Categorical columns that parse as numbers are also usable in the numeric plots.
for col in col_types['c']:
    try:
        df[col] = pd.to_numeric(df[col])
    except ValueError:
        # Genuinely non-numeric category: leave the column as it is.
        continue
    # Membership check keeps the list free of duplicates when the cell is re-run.
    if col not in col_types['int']:
        col_types['int'].append(col)

# Same guard for the numeric columns, so a re-run does not append them twice.
col_types['int'].extend(col for col in col_types['n'] if col not in col_types['int'])

In [18]:
# Preview the first rows of the assembled DataFrame.
df.head()

Unnamed: 0,b,rm,zn,age,dis,nox,rad,tax,chas,crim,medv,indus,lstat,ptratio
0,388.22,5.905,0.0,53.2,3.1523,0.583,24,666,0,4.83567,20.6,18.1,11.45,20.2
1,392.68,6.114,0.0,79.8,3.5459,0.583,24,666,0,5.69175,19.1,18.1,14.98,20.2
2,388.62,6.312,0.0,51.9,3.9917,0.583,24,666,0,3.67367,21.2,18.1,10.58,20.2
3,370.73,5.871,0.0,41.9,3.724,0.583,24,666,0,2.37857,20.6,18.1,13.34,20.2
4,392.92,5.762,0.0,40.3,4.0983,0.532,24,666,0,2.81838,21.8,18.1,10.42,20.2


In [21]:
# Save Corr Matrix
# Pairwise correlations of the numeric columns; reset_index exposes the row
# labels as an 'index' column, which the melt below uses as identifier.
corr = df[col_types['int']].corr().reset_index()

# Drop a previously stored matrix for this project before writing the new one.
stored_matrix = DataOutput.objects.filter(project_name=title, output_name='corr_matrix')
existing = stored_matrix.exists()
if existing:
    stored_matrix.delete()

# Long format (one record per pair of columns) for the JavaScript plot.
corr_records = pd.melt(corr, id_vars='index').to_dict(orient='records')
owning_project = FileMetaData.objects.get(project_name=title)
DataOutput(output=corr_records, output_name='corr_matrix', project_name=owning_project).save()

In [113]:
# Heatmap of the correlation matrix; the 'index' column is restored as row labels first.
corr_matrix = corr.set_index('index')
fig = px.imshow(
    corr_matrix,
    color_continuous_scale=[(0, "#ff9900"), (0.5, 'white'), (1, "#2D3949")],
)
# All layout settings applied in a single call.
fig.update_layout(
    showlegend=False,
    title_text="Feature Correlation Matrix",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.update_yaxes(title=None)

fig.show()

In [108]:
#var comes from col_types['int']
var = 'zn'
target = y_cols[0]
# Standardise both columns (subtract the mean, divide by the standard deviation)
# so feature and target share one scale on the plot.
extract_df = df[[var, target]]
normalized_df = (extract_df - extract_df.mean()) / extract_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 1000:
    normalized_df = normalized_df.sample(1000, random_state=0)
fig = px.scatter(normalized_df, x=var, y=target)
fig.update_layout(
    showlegend=False,
    title_text=f"Scatter plot of {var} and the target {target}",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
# Small markers with a dark outline.
fig.update_traces(marker=dict(size=5,
                              line=dict(width=2,
                                        color='#2D3949')),
                  selector=dict(mode='markers'))
fig.show()

In [64]:
# all plots
# Standardise every numeric column (subtract the mean, divide by the standard
# deviation) so all subplots share a comparable scale.
numeric_df = df[col_types['int']]
normalized_df = (numeric_df - numeric_df.mean()) / numeric_df.std()
# Cap the number of plotted points. The fixed seed makes the sampled subset, and
# therefore the figure, identical on every run.
if normalized_df.shape[0] > 500:
    normalized_df = normalized_df.sample(500, random_state=0)
# The target goes on the x axis of every subplot; each other column gets its own panel.
x = y_cols[0]
temp_cols = list(normalized_df.columns)
temp_cols.remove(x)

fig = make_subplots(rows=math.ceil(len(temp_cols) / 2), cols=2, start_cell="bottom-left",
                    subplot_titles=tuple(temp_cols))
for i, feature in enumerate(temp_cols):
    # Fill the two-column grid left to right, one grid row per pair (1-based coordinates).
    rows, cols = i // 2 + 1, i % 2 + 1
    fig.add_trace(go.Scatter(x=normalized_df[x], y=normalized_df[feature], mode='markers'),
                  row=rows, col=cols)
fig.update_layout(showlegend=False, title_text=f"Linear Relationship of {x} (x axis) and features (y axis)",
                  template="ygridoff")
fig.show()

In [115]:
# f-scores
# Replace each categorical column by integer codes so f_regression can consume it.
label_encoder = LabelEncoder()
for col in col_types['c']:
    df[col] = label_encoder.fit_transform(df[col])
target = y_cols[0]
# Candidate features: numeric plus categorical columns, minus the target.
# Filtering (instead of list.remove) does not raise when the target is in neither list.
new_l = [col for col in col_types['n'] + col_types['c'] if col != target]
X = df[new_l]
y = df[target]
# f_regression returns a pair (F-statistics, p-values); index 1 selects the p-values.
f_scores = f_regression(X, y, center=True)
p_values = pd.Series(f_scores[1], index=X.columns).sort_values(ascending=False)

fig = go.Figure([go.Bar(x=p_values.index, y=p_values.values)])
fig.update_traces(marker_color="#ff9900", marker_line_color='#2D3949',
                  marker_line_width=1.5, opacity=0.8)
# The bars are p-values, so the title says so (it previously read "F-scores").
fig.update_layout(
    showlegend=False,
    title_text="F-test p-values - Categorical and Numeric Features",
    template="presentation",
    font_family="Gravitas One",
    font_color="#2D3949",
)
fig.update_xaxes(tickangle=45)
fig.show()

['b']