# Issue labeling with the Deep Learning Reference Stack (DLRS)

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from glob import glob
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

## Prepare Data

##### Read data from disk. All files are combined into a single DataFrame, `df`; the labels we keep are in its `labels` column

In [None]:
# Load every line-delimited JSON file in the tidy data directory and combine
# them into a single DataFrame.
#
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0, and
# calling it inside a loop re-copies the accumulated frame on every iteration.
# Read all files first and concatenate them once instead.
frames = [pd.read_json(f_name, lines=True)
          for f_name in glob('/workdir/data/tidy/*.json')]

# pd.concat raises ValueError on an empty list, so fall back to an empty
# DataFrame when no files match (the same result the append loop produced).
df = pd.concat(frames) if frames else pd.DataFrame()
df.head(3)

##### Use MultiLabelBinarizer to create a multi-hot encoding of the labels for the y data

In [None]:
# Encode the label sets as a binary indicator matrix: one column per distinct
# label, 1 where the issue carries that label.
# NOTE(review): presumably each entry of df["labels"] is a collection of label
# names -- confirm against the data files.
mlb = MultiLabelBinarizer()
issue_labels = df["labels"]
# The fitted binarizer is kept so predictions can be mapped back to label names.
y = mlb.fit_transform(issue_labels)

##### Prepare X data

In [None]:
# TF-IDF settings for turning the issue bodies into numeric features.
tfidf_options = {
    'analyzer': 'word',
    'min_df': 0.0,
    'max_df': 1.0,
    'strip_accents': None,
    'encoding': 'utf-8',
    'preprocessor': None,
    # A token is any run of two or more non-whitespace characters.
    'token_pattern': r"(?u)\S\S+",
    # Cap the vocabulary at 1000 terms.
    'max_features': 1000,
}
vectorizer_X = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary from the issue bodies and build the feature matrix.
# The fitted vectorizer is kept so new text can be encoded the same way.
issue_bodies = df['body']
X = vectorizer_X.fit_transform(issue_bodies)

##### Split the data into train and test sets

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state keeps the
# split reproducible between runs.
split = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split

## Define and Train Model

In [None]:
# Size the network from the training data instead of hard-coding it.
# The vectorizer's max_features is only an upper bound, so the feature matrix
# can have fewer than 1000 columns, and the number of output units must equal
# the number of label columns in y. Hard-coded values (1000 inputs, 10 outputs)
# fail with a shape mismatch as soon as the data differs.
n_features = X_train.shape[1]
n_labels = y_train.shape[1]

# Fully connected network: features -> 1000 -> 500 -> one unit per label.
# NOTE(review): the two hidden Dense layers have no activation, so they are
# purely linear; consider adding a non-linearity such as 'relu'. Left unchanged
# here so the trained model behaves as before.
model = Sequential()
model.add(Dense(1000, input_dim=n_features))
model.add(Dense(500))
# Sigmoid outputs with binary cross-entropy score each label independently,
# which lets a single issue carry several labels at once.
model.add(Dense(n_labels, activation='sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Train for 50 epochs, reporting loss/accuracy on the held-out test split
# after every epoch (verbose=2 prints one line per epoch).
model.fit(X_train,
          y_train,
          validation_data=(X_test, y_test),
          epochs=50,
          batch_size=128,
          verbose=2)


## Test Model

In [None]:
# evaluate() returns the loss followed by the compiled metrics, so index 1 is
# the accuracy on the held-out test split.
scores = model.evaluate(X_test, y_test, verbose=0)
accuracy_pct = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_pct)

## Save Model

In [None]:
# Write the trained model to disk as a single HDF5 (.h5) file.
model_path = "/workdir/models/git-model.h5"
model.save(model_path)

## Test on new data

In [None]:
# Build a one-row frame holding an unseen issue text, using the same 'body'
# column name as the training data.
df_new = pd.DataFrame({'body': ["hey, I want a new feature added to this repo."]})
df_new.head()

In [None]:
# Encode the new text with the already-fitted vectorizer (transform only, so
# the vocabulary learned from the training data is reused).
x_new = vectorizer_X.transform(df_new['body'])

# Raw per-label sigmoid scores from the model.
raw_scores = model.predict(x_new)
print(raw_scores)

# Round each score to 0 or 1 to get a binary indicator row per sample.
prediction = np.around(raw_scores)
print(prediction)

# Column order of the indicator matrix, i.e. which label each score refers to.
print(mlb.classes_)

# Map the binary indicators back to label names.
mlb.inverse_transform(prediction)