In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1 Merge Data with Headers

## 1.1 Read in Headers

In [None]:
Header_lst = []

# Build the column-name list from the field description file: each line
# contributes the text before its first ":" as one header.
# The `with` block guarantees the file handle is closed once parsing is done.
with open("/kaggle/input/protein-localization/field_descriptions.txt", "r") as f:
    for x in f:
        # Remove quotation marks
        x = x.replace('"', "")
        # Keep only the part before the first ":"
        x = x.split(':')[0]
        # Remove surrounding whitespace
        x = x.strip()
        # Remove tabs left inside the header
        x = x.replace("\t", "")
        # Append header to our list
        Header_lst.append(x)

# Number of parsed headers (shown as the cell output)
len(Header_lst)

## 1.2 Bind headers to Train and Test

### 1.2.1 Read in Train Data with Headers

In [None]:
# Load the training CSV, labelling its columns with the parsed header list.
Train = pd.read_csv(
    "/kaggle/input/protein-localization/train.csv",
    names=Header_lst,
)
Train.head()
#Train.to_csv("Train With Headers.csv", index=False)

### 1.2.2 Read in Test Data with Headers

In [None]:
# The test file is labelled with every header except the final one.
Test_header = Header_lst[:-1]
Test = pd.read_csv(
    "/kaggle/input/protein-localization/test.csv",
    names=Test_header,
)
Test.head()
#Test.to_csv("Test With Headers.csv", index=False)

# 2 Preprocess Data

## 2.1 Remove columns with Missing Values

### 2.1.1 Subset Training Data

In [None]:
# Author's notes on the column layout:
#   - missing interactions (in Train): columns 2183 to 2944
#   - missing functions (in Test): columns 2945 to 2959
#   - columns 2960 and 2961 hold the target variable

const_nonMissing = 2183   # exclusive upper bound of the column positions kept below
const_Interactions = 458  # position from which later cells slice the interaction columns

# Keep column positions 0 .. const_nonMissing - 1, then discard the "Protein" column.
Xtrain = Train.iloc[:, :const_nonMissing].drop(columns="Protein")

### 2.1.2 Subset Test Data

In [None]:
# Keep column positions 0 .. const_nonMissing - 1, then discard the "Protein" column.
Xtest = Test.iloc[:, :const_nonMissing].drop(columns="Protein")

## 2.2 Map Categorical Data to Numeric Data

### 2.2.1 Define Mappings

In [None]:
# Numeric encoding for the ESSENTIAL column; both "Ambiguous" levels share 0.5.
EssentialMapping = {
    "Non-Essential": 0,
    "Ambiguous-Essential": 0.5,
    "Ambiguous-Non-Essential": 0.5,
    "Essential": 1,
}

# Binary encoding for Yes/No answers.
YesNoMapping = {
    "Yes": 1,
    "No": 0,
}

# "?" entries become NaN.
NanMapping = {"?": np.nan}

# NaN entries become the literal label "Unknown".
TypeMissingMapping = {np.nan: "Unknown"}

### 2.2.2 Apply Mappings to Train

In [None]:
# Encode the categorical answers as numbers and turn "?" entries into NaN.
Xtrain = (
    Xtrain
    .replace({"ESSENTIAL": EssentialMapping})
    .replace(YesNoMapping)
    .replace(NanMapping)
)

# From position const_Interactions onward, every second column gets its
# NaN entries relabelled via TypeMissingMapping.
_type_cols = slice(const_Interactions, None, 2)
Xtrain.iloc[:, _type_cols] = Xtrain.iloc[:, _type_cols].replace(TypeMissingMapping)

### 2.2.3 Apply Mappings to Test

In [None]:
# Encode the categorical answers as numbers and turn "?" entries into NaN.
Xtest = (
    Xtest
    .replace({"ESSENTIAL": EssentialMapping})
    .replace(YesNoMapping)
    .replace(NanMapping)
)

# From position const_Interactions onward, every second column gets its
# NaN entries relabelled via TypeMissingMapping.
_type_cols = slice(const_Interactions, None, 2)
Xtest.iloc[:, _type_cols] = Xtest.iloc[:, _type_cols].replace(TypeMissingMapping)

## 2.3 Chromosome

### 2.3.1 Training Data

In [None]:
# Training rows whose Chromosome value is missing.
_chromosome_missing = Xtrain["Chromosome"].isna()
Xtrain[_chromosome_missing]

In [None]:
# Replace missing Chromosome values with the column's own median.
_train_chromosome_median = Xtrain["Chromosome"].median()
Xtrain["Chromosome"] = Xtrain["Chromosome"].fillna(_train_chromosome_median)
Xtrain

In [None]:
# Re-check: training rows that still have a missing Chromosome value.
_chromosome_missing = Xtrain["Chromosome"].isna()
Xtrain[_chromosome_missing]

### 2.3.2 Test Data

In [None]:
# Test rows whose Chromosome value is missing.
_chromosome_missing = Xtest["Chromosome"].isna()
Xtest[_chromosome_missing]

In [None]:
# Fill with the median of the *training* column, not the test column.
_train_chromosome_median = Xtrain["Chromosome"].median()
Xtest["Chromosome"] = Xtest["Chromosome"].fillna(_train_chromosome_median)
Xtest

In [None]:
# Re-check: test rows that still have a missing Chromosome value.
_chromosome_missing = Xtest["Chromosome"].isna()
Xtest[_chromosome_missing]

## 2.4 Interacting Proteins

### 2.4.1 One Hot Encoding

#### 2.4.1.1 Training Data

In [None]:
# Interaction-type columns: every second column from const_Interactions on.
# Take an explicit copy: a later cell appends rows to this frame via .loc, and
# that should operate on an independent frame rather than on a slice of Xtrain
# (avoids chained-assignment ambiguity and a possible SettingWithCopyWarning).
train_ip_type = Xtrain.iloc[:, const_Interactions::2].copy()
train_ip_type

In [None]:
# Append one sentinel row per interaction type so that every type occurs in
# every column; get_dummies then emits the same set of indicator columns
# regardless of which types the real rows contain.
_n_type_cols = len(train_ip_type.columns)
gRow = ["Genetic"] * _n_type_cols
pRow = ["Physical"] * _n_type_cols
gpRow = ["Genetic-Physical"] * _n_type_cols
uRow = ["Unknown"] * _n_type_cols

# Each sentinel is stored under the label len(frame) + 1, evaluated at insertion time.
for _sentinel in (gRow, pRow, gpRow, uRow):
    train_ip_type.loc[len(train_ip_type) + 1] = _sentinel

train_ip_type = pd.get_dummies(train_ip_type, drop_first=True)
train_ip_type

In [None]:
# Remove the four sentinel rows again; their index labels are
# len, len-1, len-2 and len-3 of the current frame.
_n_rows = len(train_ip_type)
_sentinel_labels = [_n_rows - _offset for _offset in range(4)]
train_ip_type = train_ip_type.drop(index=_sentinel_labels)
train_ip_type

In [None]:
# Columns before const_Interactions, followed by the one-hot encoded type columns.
_train_base = Xtrain.iloc[:, :const_Interactions]
X_train = pd.concat([_train_base, train_ip_type], axis=1)
X_train

#### 2.4.1.2 Test Data

In [None]:
# Interaction-type columns: every second column from const_Interactions on.
# Take an explicit copy: a later cell appends rows to this frame via .loc, and
# that should operate on an independent frame rather than on a slice of Xtest
# (avoids chained-assignment ambiguity and a possible SettingWithCopyWarning).
test_ip_type = Xtest.iloc[:, const_Interactions::2].copy()
test_ip_type

In [None]:
# Append the same sentinel rows used for the training frame so the test
# frame yields the same indicator columns.
# Each sentinel is stored under the label len(frame) + 1, evaluated at insertion time.
for _sentinel in (gRow, pRow, gpRow, uRow):
    test_ip_type.loc[len(test_ip_type) + 1] = _sentinel

test_ip_type = pd.get_dummies(test_ip_type, drop_first=True)
test_ip_type

In [None]:
# Remove the four sentinel rows again; their index labels are
# len, len-1, len-2 and len-3 of the current frame.
_n_rows = len(test_ip_type)
_sentinel_labels = [_n_rows - _offset for _offset in range(4)]
test_ip_type = test_ip_type.drop(index=_sentinel_labels)
test_ip_type

In [None]:
# Columns before const_Interactions, followed by the one-hot encoded type columns.
_test_base = Xtest.iloc[:, :const_Interactions]
X_test = pd.concat([_test_base, test_ip_type], axis=1)
X_test

### 2.4.2 Correlations

#### 2.4.2.1 Training Data

In [None]:
# Every second column starting one position after const_Interactions,
# i.e. the columns interleaved with the interaction-type columns.
_corr_start = const_Interactions + 1
train_ip_corr = Xtrain.iloc[:, _corr_start::2]
train_ip_corr

In [None]:
# Rows with at least one missing value.
_rows_with_gaps = train_ip_corr.isnull().any(axis=1)
train_ip_corr[_rows_with_gaps]

In [None]:
# Fill each column's gaps with that column's median, then re-check for
# rows that still contain a missing value.
_train_corr_medians = train_ip_corr.median()
train_ip_corr = train_ip_corr.fillna(_train_corr_medians)
_rows_with_gaps = train_ip_corr.isnull().any(axis=1)
train_ip_corr[_rows_with_gaps]

In [None]:
# Append the filled columns to the right of the training feature matrix.
X_train = pd.concat([X_train, train_ip_corr], axis=1)
X_train

#### 2.4.2.2 Test Data

In [None]:
# Every second column starting one position after const_Interactions,
# i.e. the columns interleaved with the interaction-type columns.
_corr_start = const_Interactions + 1
test_ip_corr = Xtest.iloc[:, _corr_start::2]
test_ip_corr

In [None]:
# Rows with at least one missing value.
_rows_with_gaps = test_ip_corr.isnull().any(axis=1)
test_ip_corr[_rows_with_gaps]

In [None]:
# Note: the fill values are the per-column medians of the *training* frame.
_train_corr_medians = train_ip_corr.median()
test_ip_corr = test_ip_corr.fillna(_train_corr_medians)
_rows_with_gaps = test_ip_corr.isnull().any(axis=1)
test_ip_corr[_rows_with_gaps]

In [None]:
# Append the filled columns to the right of the test feature matrix.
X_test = pd.concat([X_test, test_ip_corr], axis=1)
X_test

## 2.5 Impute Essential with KNN

In [None]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
# NOTE(review): `scaler` is not referenced again in the visible cells —
# confirm whether features were meant to be scaled before KNN imputation.
scaler = MinMaxScaler()

### 2.5.1 Training Data

In [None]:
# Fit a KNNImputer (n_neighbors=2) on the full training matrix, then copy
# back only the imputed ESSENTIAL column.
imputer = KNNImputer(n_neighbors=2)
_train_imputed = pd.DataFrame(
    imputer.fit_transform(X_train),
    columns=X_train.columns,
)
# The assignment aligns on index; _train_imputed carries a default 0..n-1 index.
X_train["ESSENTIAL"] = _train_imputed["ESSENTIAL"]

### 2.5.2 Test Data

In [None]:
# Impute the test matrix with the imputer that was fitted on X_train
# (transform only, no refit), so test rows are filled from training
# neighbours. This matches how the train medians are reused for the test
# set elsewhere in this notebook.
_test_imputed = pd.DataFrame(
    imputer.transform(X_test),
    columns=X_test.columns,
)
# The assignment aligns on index; _test_imputed carries a default 0..n-1 index.
X_test["ESSENTIAL"] = _test_imputed["ESSENTIAL"]

## 2.6 Clean All Data

In [None]:
# Convert every column of both matrices to a numeric dtype; values that
# cannot be parsed become NaN (errors='coerce').
X_train, X_test = (
    _frame.apply(pd.to_numeric, errors='coerce')
    for _frame in (X_train, X_test)
)

In [None]:
# Target vector: the "Label" column of the raw training frame.
y_train = Train.loc[:, "Label"]

In [None]:
# Persist the prepared matrices and the target without the index column.
for _frame, _path in (
    (X_train, "X_train.csv"),
    (y_train, "y_train.csv"),
    (X_test, "X_test.csv"),
):
    _frame.to_csv(_path, index=False)

# 3 Submit Function

In [None]:
def generateSubmit(p, filename):
    """Write a two-column submission CSV (``Key``, ``Label``).

    Parameters
    ----------
    p : pandas.DataFrame
        Predictions; column ``0`` supplies the ``Label`` values.
    filename : str
        Path of the CSV file to write (written without the index).

    Notes
    -----
    The ``Key`` values come from the module-level ``Test["Protein"]`` column.
    The two columns are joined side by side on their index, so ``p`` is
    presumably expected to share ``Test``'s index — verify at the call site.
    """
    submission = pd.concat([Test["Protein"], p[0]], axis=1)
    submission.columns = ["Key", "Label"]
    submission.to_csv(filename, index=False)

# 4 Logistic Regression

## 4.1 Fit Model

In [None]:
from sklearn.linear_model import LogisticRegression
# Baseline: logistic regression with default hyper-parameters.
logisticRegression = LogisticRegression()
logisticRegression.fit(X=X_train, y=y_train)

## 4.2 Cross-Validation

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# 10-fold shuffled cross-validation (fixed seed) of the baseline model;
# reports mean accuracy with its standard deviation in parentheses.
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(
    logisticRegression,
    X_train,
    y_train,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

## 4.3 Tune

In [None]:
from sklearn.model_selection import GridSearchCV
import warnings
# Silence library warnings for the remainder of the session.
warnings.simplefilter('ignore')

# Each penalty must be its own list element: adjacent string literals with no
# comma between them are concatenated by Python into one (invalid) value.
# The liblinear solver supports only the 'l1' and 'l2' penalties, so the grid
# is limited to those two.
params = {'penalty': ['l2', 'l1'],
          'C': [1, 1.1, 1.2],
          'solver': ['liblinear']}

lr = LogisticRegression()

# 5-fold grid search over the parameter grid, scored by accuracy.
lr_grid = GridSearchCV(lr, params, cv=5, n_jobs=-1, scoring='accuracy')
lr_grid.fit(X_train, y_train)
lr_grid.best_params_

In [None]:
# Cross-validate a logistic regression with hard-coded hyper-parameters,
# using the same 10-fold shuffled split as the baseline.
lr_tuned = LogisticRegression(
    penalty='l2',
    C=1.1,
    solver='liblinear',
    class_weight=None,
)
cv = KFold(n_splits=10, shuffle=True, random_state=1)
scores = cross_val_score(
    lr_tuned,
    X_train,
    y_train,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)
print('Accuracy: %.3f (%.3f)' % (scores.mean(), scores.std()))

In [None]:
# Fit the tuned model on the full training set.
lr_tuned.fit(X=X_train, y=y_train)

## 4.4 Get Predictions

In [None]:
# Predict labels for the test matrix and wrap them in a single-column frame
# (the column is named 0 by default).
_predicted_labels = lr_tuned.predict(X_test)
predictions = pd.DataFrame(_predicted_labels)
predictions

In [None]:
# Write the tuned model's predictions as a submission file.
generateSubmit(p=predictions, filename="tuned_LR.csv")