In [22]:
# import modules and functions
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_validate, KFold, GridSearchCV

In [23]:
# Use processed Switzerland Data
heart_file = 'heart/heart+disease/processed.switzerland.data'
twitter_file = 'twitter/twitter/twitter.csv'
#heart_name_list = ['age', 'sex','cp','trestbps','chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak','slope','ca','thal','num']

# Read heart disease and twitter data files
heart_data = pd.read_csv(heart_file)
twitter_data = pd.read_csv(twitter_file)
#print(heart_data)

# Select features and target
# Heart disease sort features
X_heart = heart_data[['trestbps', 'chol']].dropna() # dropna() drops the NaN values because they are NULL.
y_heart = heart_data.loc[X_heart.index, ['age']] # Use X as main source for y.

# Twitter features
X_twitter = twitter_data[['longitude', 'latitude']]
y_twitter = twitter_data[['timestamp']]

In [24]:
# 80% train data
# Train test split for heart disease
random_val = 14
test_val = 0.2

# 80% train data
X_heart_train, X_heart_test, y_heart_train, y_heart_test = train_test_split(X_heart, y_heart, test_size=test_val, random_state=random_val)

# 20% train data, 80% validation
X_heart_train, X_heart_val, y_heart_train, y_heart_val = train_test_split(X_heart_train, y_heart_train, test_size=test_val/(1-test_val), random_state=random_val)

# Train test split for twitter data
X_twitter_train, X_twitter_test, y_twitter_train, y_twitter_test = train_test_split(X_twitter, y_twitter, test_size=test_val, random_state=random_val)
X_twitter_train, X_twitter_val, y_twitter_train, y_twitter_val = train_test_split(X_twitter_train, y_twitter_train, test_size=test_val/(1-test_val), random_state=random_val)

#X_heart_train = np.ravel(X_heart_train)
y_heart_train = np.ravel(y_heart_train)
#X_twitter_train = np.ravel(X_twitter_train)
y_twitter_train = np.ravel(y_twitter_train)


In [25]:
# K Neighbors amount
k = 7

# Model for heart disease
HeartMod = LinearRegression() # Linear model init
HeartMod.fit(X_heart_train, y_heart_train)

# K Neighbors for Heart Disease
HeartKn = KNeighborsClassifier(n_neighbors=k) # K Neighbors init

# Model for Twitter data
TwitterMod = LinearRegression() # Linear model init
TwitterMod.fit(X_twitter_train, y_twitter_train)

# K Neighbors for Twitter
TwitterKn = KNeighborsClassifier(n_neighbors=k) # K Neighbors init


In [26]:
# Cross validation
cross_index = 5

# K Tuning Grid
k_tuning = {'n_neighbors': [5, 7, 8, 9]}

# k fold
kf = KFold(n_splits=cross_index, shuffle=True, random_state=random_val)

# Tuning for Heart Disease
HeartKnTune = GridSearchCV(HeartKn, k_tuning, cv=kf)
HeartKnTune.fit(X_heart_train, y_heart_train)

# Tuning for Twitter data
TwitterKnTune = GridSearchCV(TwitterKn, k_tuning, cv=kf)
TwitterKnTune.fit(X_twitter_train, y_twitter_train)

In [27]:
# cross validate heart disease
HeartMod_score = cross_validate(HeartMod, X_heart_train, y_heart_train, cv=cross_index)
HeartK_score = cross_validate(HeartKn, X_heart_train, y_heart_train, cv=kf)
HeartK_tune_score = HeartKnTune.cv_results_['mean_test_score']

# Cross validate twitter data
TwitterMod_score = cross_validate(TwitterMod, X_twitter_train, y_twitter_train, cv=cross_index)
TwitterK_score = cross_validate(TwitterKn, X_twitter_train, y_twitter_train, cv=kf)
TwitterK_tune_score = TwitterKnTune.cv_results_['mean_test_score']

In [28]:
print("Linear Regression Scores-------------")
print("Heart Linear Regression Score: ", HeartMod_score)
print("Twitter Linear Regression Score: ", TwitterMod_score)


print("K Nearest Neighbors Scores-------------")
print("Heart K Nearest Neighbors Score: ", HeartK_score)
print("Twitter K Nearest Neighbors Score: ", TwitterK_score)

print("Tuning Grid Scores-------------")
print("Heart Tuning Score: ", HeartK_tune_score)
print("Twitter Tuning Score: ", TwitterK_tune_score)

Linear Regression Scores-------------
Heart Linear Regression Score:  {'fit_time': array([0.00436258, 0.00084472, 0.00081897, 0.00074601, 0.00080872]), 'score_time': array([0.00083017, 0.00101161, 0.00058317, 0.00059104, 0.00061631]), 'test_score': array([ 0.15720827, -0.52165375,  0.05400316,  0.21498426,  0.17413072])}
Twitter Linear Regression Score:  {'fit_time': array([0.27997494, 0.2762568 , 0.29662156, 0.2833848 , 0.26684642]), 'score_time': array([0.02142859, 0.02307701, 0.0244329 , 0.02170944, 0.02049398]), 'test_score': array([2.50116945e-05, 1.03366412e-05, 1.63651676e-05, 2.87955430e-05,
       2.53904634e-05])}
K Nearest Neighbors Scores-------------
Heart K Nearest Neighbors Score:  {'fit_time': array([0.00186396, 0.00103045, 0.00115037, 0.0012641 , 0.00083756]), 'score_time': array([0.00189829, 0.00180888, 0.00172734, 0.00141311, 0.00173831]), 'test_score': array([0.        , 0.        , 0.        , 0.07142857, 0.        ])}
Twitter K Nearest Neighbors Score:  {'fit_time