### Import Libraries 

In [13]:
%%capture

!pip install tensorflow

In [15]:
%%capture

import pandas as pd
import numpy as np
import re, string
import xgboost as xgb
from sklearn.model_selection import train_test_split
import gensim.utils
from gensim.models import word2vec, Word2Vec
from joblib import dump, load

### Import Data

In [16]:
data = pd.read_csv('/content/drive/MyDrive/DataMining/sampled_data_5000_no_code.csv')
data.head()

Unnamed: 0,id,title,text,full_text_no_code,labels
0,36154834,What's the function of ExpiresAbsolute?,<p>I'm trying to port a really old and poorly ...,whats the function of expiresabsolute im tryin...,html
1,30891002,Select Attribute value and field value Using j...,<p>I have multiple input field like this</p> ...,select attribute value and field value using j...,"html,jquery"
2,8198499,jquery effects not working,"<p>If you go to <a href=""http://summer-festiva...",jquery effects not working if you go to this p...,"javascript,jquery"
3,16553766,Floating divs around and under an image,<p>I am trying to design a page with a floatin...,floating divs around and under an image i am t...,css
4,19486782,How to align text boxes,<p>I want to align following form all text box...,how to align text boxes i want to align follow...,css


### Add label columns

In [17]:
columns = ['html', 'css', 'javascript', 'jquery']

for column in columns:
    data.loc[:, column] = data.labels.str.contains(column)

for column in columns:
    data.loc[:, column] = data[column].replace({False: 0, True: 1}).astype(int)
    
data.head()

Unnamed: 0,id,title,text,full_text_no_code,labels,html,css,javascript,jquery
0,36154834,What's the function of ExpiresAbsolute?,<p>I'm trying to port a really old and poorly ...,whats the function of expiresabsolute im tryin...,html,1,0,0,0
1,30891002,Select Attribute value and field value Using j...,<p>I have multiple input field like this</p> ...,select attribute value and field value using j...,"html,jquery",1,0,0,1
2,8198499,jquery effects not working,"<p>If you go to <a href=""http://summer-festiva...",jquery effects not working if you go to this p...,"javascript,jquery",0,0,1,1
3,16553766,Floating divs around and under an image,<p>I am trying to design a page with a floatin...,floating divs around and under an image i am t...,css,0,1,0,0
4,19486782,How to align text boxes,<p>I want to align following form all text box...,how to align text boxes i want to align follow...,css,0,1,0,0


### Prepare Data

In [18]:
# Split full text to list
data['full_text_no_code_list'] = data['full_text_no_code'].apply(lambda x: gensim.utils.simple_preprocess(x))
data.head()

Unnamed: 0,id,title,text,full_text_no_code,labels,html,css,javascript,jquery,full_text_no_code_list
0,36154834,What's the function of ExpiresAbsolute?,<p>I'm trying to port a really old and poorly ...,whats the function of expiresabsolute im tryin...,html,1,0,0,0,"[whats, the, function, of, expiresabsolute, im..."
1,30891002,Select Attribute value and field value Using j...,<p>I have multiple input field like this</p> ...,select attribute value and field value using j...,"html,jquery",1,0,0,1,"[select, attribute, value, and, field, value, ..."
2,8198499,jquery effects not working,"<p>If you go to <a href=""http://summer-festiva...",jquery effects not working if you go to this p...,"javascript,jquery",0,0,1,1,"[jquery, effects, not, working, if, you, go, t..."
3,16553766,Floating divs around and under an image,<p>I am trying to design a page with a floatin...,floating divs around and under an image i am t...,css,0,1,0,0,"[floating, divs, around, and, under, an, image..."
4,19486782,How to align text boxes,<p>I want to align following form all text box...,how to align text boxes i want to align follow...,css,0,1,0,0,"[how, to, align, text, boxes, want, to, align,..."


In [19]:
# Split data

train_df = data.sample(frac = 0.8)
test_df = data.drop(train_df.index)

In [20]:
# Define Train and Test pd.series

x_train = train_df['full_text_no_code_list']
x_test = test_df['full_text_no_code_list']

### Build Word2Vec model

In [None]:
num_features = 300    
min_word_count = 3    
num_workers = 4       
context = 8           
downsampling = 1e-3   

# Initialize and train the model
W2Vmodel = Word2Vec(sentences=x_train, sg=1, hs=0, workers=num_workers, vector_size=num_features, min_count=min_word_count, window=context,
                    sample=downsampling, negative=5, epochs=6)

In [None]:
# Save model
W2Vmodel.save("/content/drive/MyDrive/DataMining/word2vec.model")

In [21]:
# Load model
W2Vmodel = Word2Vec.load("/content/drive/MyDrive/DataMining/word2vec.model")

In [22]:
# Test model

W2Vmodel.wv.most_similar('html', topn=10)

[('hmtl', 0.6132817268371582),
 ('hai', 0.590995728969574),
 ('htmlphp', 0.5895947813987732),
 ('pdflib', 0.5698714852333069),
 ('domtree', 0.569623589515686),
 ('labelinput', 0.5675471425056458),
 ('datatxt', 0.5675139427185059),
 ('pagejsp', 0.5661720633506775),
 ('phphtml', 0.5644363164901733),
 ('gsp', 0.5633955001831055)]

In [23]:
# Convert the Train and Test series to arrays of array

words = set(W2Vmodel.wv.index_to_key)
X_train_vect = np.array([np.array([W2Vmodel.wv[i] for i in ls if i in words])
                         for ls in x_train])
X_test_vect = np.array([np.array([W2Vmodel.wv[i] for i in ls if i in words])
                         for ls in x_test])

  X_train_vect = np.array([np.array([W2Vmodel.wv[i] for i in ls if i in words])
  X_test_vect = np.array([np.array([W2Vmodel.wv[i] for i in ls if i in words])


In [24]:
# We currently have a vector of size 300 for every words
# We create a vector of size 300 for each documents by calculateing the mean of the n vectors of each word
# n = the number of words in every document 

X_train_vect_avg = []
for v in X_train_vect:
    if v.size:
        X_train_vect_avg.append(v.mean(axis=0))
    else:
        X_train_vect_avg.append(np.zeros(300, dtype=float))
X_test_vect_avg = []
for v in X_test_vect:
    if v.size:
        X_test_vect_avg.append(v.mean(axis=0))
    else:
        X_test_vect_avg.append(np.zeros(300, dtype=float))

### F1 Calculation Function

In [25]:
def calculate_f1(dataframe):

  dataframe['score'] = 0

  for index, row in dataframe.iterrows():
      common_labels = 0
      all_labels = 0

      for label in ['html','css','javascript','jquery']:
          filtered_columns = row.filter(like=label).index.tolist()
          
          true_label = row[filtered_columns[0]]
          predicted_label = row[filtered_columns[1]]
          
          if true_label == 1 and predicted_label == 1:
              common_labels += 1

          all_labels += true_label
          all_labels += predicted_label

      dataframe.at[index, 'score'] = 2 * common_labels / all_labels

  f1_score = dataframe['score'].sum() / 15000

  return f1_score

### Random Forest Classifier

In [40]:
from sklearn.ensemble import RandomForestClassifier

RandomForest = pd.DataFrame()
load_model = False

for label in ['html','css','javascript','jquery']:

  y_train = train_df[label]
  y_test = test_df[label]

  if not load_model:
    rf = RandomForestClassifier()
    rf_model = rf.fit(X_train_vect_avg, y_train.values.ravel())

    # Save model
    dump(rf_model, f'/content/drive/MyDrive/DataMining/{label}_random_forest.joblib')
  else:
    # load model
    rf_model = load(f'/content/drive/MyDrive/DataMining/{label}_random_forest.joblib')

  y_pred = rf_model.predict(X_test_vect_avg)

  RandomForest[f'{label}_real'] = np.array(y_test)
  RandomForest[f'{label}_pred'] = y_pred

In [41]:
RandomForest.head()

Unnamed: 0,html_real,html_pred,css_real,css_pred,javascript_real,javascript_pred,jquery_real,jquery_pred
0,0,0,1,1,1,1,0,0
1,0,0,1,1,0,0,1,1
2,1,1,1,1,0,0,0,0
3,0,0,0,0,0,0,1,1
4,1,1,1,1,0,0,1,1


In [42]:
RandomForest_score = calculate_f1(RandomForest)
RandomForest_score

0.7115650793650794

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

LogisticRegression_df = pd.DataFrame()
load_model = False

for label in ['html','css','javascript','jquery']:

  y_train = train_df[label]
  y_test = test_df[label]

  if not load_model:
    clf = LogisticRegression(solver='lbfgs', max_iter=1000)
    clf.fit(X_train_vect_avg, y_train)

    # Save model
    dump(clf, f'/content/drive/MyDrive/DataMining/{label}_logistic_regression.joblib')
  else:
    # load model
    clf = load(f'/content/drive/MyDrive/DataMining/{label}_logistic_regression.joblib')

  y_pred = clf.predict(X_test_vect_avg)

  LogisticRegression_df[f'{label}_real'] = np.array(y_test)
  LogisticRegression_df[f'{label}_pred'] = y_pred

LogisticRegression_df.head()

In [48]:
LogisticRegression_score = calculate_f1(LogisticRegression_df)
LogisticRegression_score

0.751652380952381

### Simple NN Model

In [26]:
import tensorflow as tf
from tensorflow import keras

simpleNN = pd.DataFrame()
load_model = True

for label in ['html','css','javascript','jquery']:

  y_train = train_df[label]
  y_test = test_df[label]

  if not load_model:
    model = keras.models.Sequential([
      keras.layers.Reshape((300, 1, 1), input_shape=(300,)),
      keras.layers.Conv2D(64, (3, 1), activation='relu'),
      keras.layers.MaxPooling2D((2, 1)),
      keras.layers.Flatten(),
      keras.layers.Dense(128, activation='relu'),
      keras.layers.Dense(15, activation='softmax')
  ])
    
    model.compile(optimizer='adam',
                loss='sparse_categorical_crossentropy',
                metrics=['accuracy'])

    model.fit(np.array(X_train_vect_avg), np.array(y_train), epochs=5, batch_size=128)
    model.save(f'/content/drive/MyDrive/DataMining/{label}_model.keras')
  else:
    model = tf.keras.models.load_model(f'/content/drive/MyDrive/DataMining/{label}_model.keras')

  # Make predictions on the test dataset
  predictions = model.predict(np.array(X_test_vect_avg))

  # Assuming you want to get the predicted class labels
  predicted_labels = np.argmax(predictions, axis=1)

  simpleNN[f'{label}_real'] = np.array(y_test)
  simpleNN[f'{label}_pred'] = predicted_labels




In [None]:
tf.keras.utils.plot_model(model, to_file='nn.jpg', show_shapes=True)

In [27]:
simpleNN_score = calculate_f1(simpleNN)
simpleNN_score

0.7418914285714284

### RNN

In [38]:
import tensorflow as tf
from tensorflow import keras

RNN = pd.DataFrame()
load_model = False

for label in ['html','css','javascript','jquery']:

  y_train = train_df[label]
  y_test = test_df[label]

  if not load_model:
    rnn_model = keras.models.Sequential([
      keras.layers.Reshape((300, 1), input_shape=(300,)),
      keras.layers.Conv1D(64, 3, activation='relu'),
      keras.layers.MaxPooling1D(2),
      keras.layers.Reshape((149, 64)),  # Reshape for RNN input
      keras.layers.SimpleRNN(32),  # RNN layer
      keras.layers.Flatten(),
      keras.layers.Dense(128, activation='relu'),
      keras.layers.Dense(15, activation='softmax')
  ])
    
    rnn_model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

    rnn_model.fit(np.array(X_train_vect_avg), np.array(y_train), epochs=5, batch_size=128)
    rnn_model.save(f'/content/drive/MyDrive/DataMining/{label}_rnn_model.keras')
  else:
    rnn_model = tf.keras.models.load_model(f'/content/drive/MyDrive/DataMining/{label}_rnn_model.keras')

  # Make predictions on the test dataset
  predictions = rnn_model.predict(np.array(X_test_vect_avg))

  # Assuming you want to get the predicted class labels
  predicted_labels = np.argmax(predictions, axis=1)

  RNN[f'{label}_real'] = np.array(y_test)
  RNN[f'{label}_pred'] = predicted_labels


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
tf.keras.utils.plot_model(rnn_model, to_file='rnn.jpg', show_shapes=True)

In [39]:
RNN_score = calculate_f1(RNN)
RNN_score

0.709088253968254

### Chain Classifier

In [28]:
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain

In [29]:
train_labels = train_df[['html','css','javascript','jquery']].values
test_labels = test_df[['html','css','javascript','jquery']].values

#### Chain Classifier + Logistic Regression

In [34]:
load_model = True

base_lr = LogisticRegression(solver='lbfgs', random_state=0)

if not load_model:
  chain = ClassifierChain(base_lr, order='random', random_state=0)
  chain.fit(np.array(X_train_vect_avg), train_labels)
  dump(chain, '/content/drive/MyDrive/DataMining/LogisticRegression_Chain.joblib')

else: 

  chain = load('/content/drive/MyDrive/DataMining/LogisticRegression_Chain.joblib') 

In [35]:
predicted_values = chain.predict(np.array(X_test_vect_avg))

predicted_values_df = pd.DataFrame(predicted_values)
predicted_values_df = predicted_values_df.rename(columns={0:'html_pred',1:'css_pred',2:'javascript_pred',3:'jquery_pred'})

In [36]:
true_values_df = test_df[['html','css','javascript','jquery']]

predicted_values_df.reset_index(drop=True,inplace=True)
true_values_df.reset_index(drop=True,inplace=True)

chain_logregression = pd.concat([predicted_values_df, true_values_df], axis=1)
chain_logregression.head()

Unnamed: 0,html_pred,css_pred,javascript_pred,jquery_pred,html,css,javascript,jquery
0,0.0,1.0,1.0,0.0,0,1,1,0
1,0.0,1.0,0.0,1.0,0,1,0,1
2,1.0,1.0,0.0,0.0,1,1,0,0
3,0.0,0.0,0.0,1.0,0,0,0,1
4,1.0,1.0,0.0,1.0,1,1,0,1


In [37]:
chain_logregression_score = calculate_f1(chain_logregression)
chain_logregression_score

0.7551101587301589

#### Chain Classifier + XGBoost

In [30]:
load_model = True

xgb_classifier = xgb.XGBClassifier()

if not load_model:
  chain = ClassifierChain(xgb_classifier, order='random', random_state=0)
  chain.fit(np.array(X_train_vect_avg), train_labels)
  dump(chain, '/content/drive/MyDrive/DataMining/XGBClassifier.joblib')

else: 
  
  chain = load('/content/drive/MyDrive/DataMining/XGBClassifier.joblib') 

In [31]:
predicted_values = chain.predict(np.array(X_test_vect_avg))

predicted_values_df = pd.DataFrame(predicted_values)
predicted_values_df = predicted_values_df.rename(columns={0:'html_pred',1:'css_pred',2:'javascript_pred',3:'jquery_pred'})

In [32]:
true_values_df = test_df[['html','css','javascript','jquery']]

predicted_values_df.reset_index(drop=True,inplace=True)
true_values_df.reset_index(drop=True,inplace=True)

chain_xgboost = pd.concat([predicted_values_df, true_values_df], axis=1)
chain_xgboost.head()

Unnamed: 0,html_pred,css_pred,javascript_pred,jquery_pred,html,css,javascript,jquery
0,0.0,1.0,1.0,0.0,0,1,1,0
1,0.0,1.0,0.0,1.0,0,1,0,1
2,1.0,1.0,0.0,0.0,1,1,0,0
3,0.0,0.0,0.0,1.0,0,0,0,1
4,1.0,1.0,0.0,1.0,1,1,0,1


In [33]:
chain_xgboost_score = calculate_f1(chain_xgboost)
chain_xgboost_score

0.8777463492063493

### K-NN

In [None]:
train_labels = train_df[['html','css','javascript','jquery']].values
test_labels = test_df[['html','css','javascript','jquery']].values

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.multioutput import ClassifierChain
knn_clf=KNeighborsClassifier(n_neighbors=15,weights='distance',p=1)
knn_clf.fit(np.array(X_train_vect_avg), train_labels)


In [None]:
predicted_values = knn_clf.predict(np.array(X_test_vect_avg))

predicted_values_df = pd.DataFrame(predicted_values)
predicted_values_df = predicted_values_df.rename(columns={0:'html_pred',1:'css_pred',2:'javascript_pred',3:'jquery_pred'})

In [None]:
true_values_df = test_df[['html','css','javascript','jquery']]

predicted_values_df.reset_index(drop=True,inplace=True)
true_values_df.reset_index(drop=True,inplace=True)

knn_df = pd.concat([predicted_values_df, true_values_df], axis=1)
knn_df

Unnamed: 0,html_pred,css_pred,javascript_pred,jquery_pred,html,css,javascript,jquery
0,1,1,0,0,0,1,0,0
1,0,1,1,0,1,0,1,0
2,1,1,0,0,1,1,1,0
3,0,0,0,1,0,0,0,1
4,1,0,1,0,1,0,0,0
...,...,...,...,...,...,...,...,...
14995,0,0,1,0,0,0,1,0
14996,1,1,0,0,1,1,0,0
14997,1,1,0,1,1,1,1,0
14998,1,1,1,1,0,0,1,1


In [None]:
knn_score = calculate_f1(knn_df)
knn_score

### K-Means

In [None]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import RidgeClassifier
from sklearn.cluster import KMeans

class kmeans_clustering():

        
    def __init__(self,X_train,X_test,y_train,y_test):
        self.X_train, self.X_test, self.y_train, self.y_test = pd.DataFrame(X_train),pd.DataFrame(X_test),pd.DataFrame(y_train),pd.DataFrame(y_test)
    
    
    def classification(self, model=RidgeClassifier(random_state=42)):
        model.fit(self.X_train, self.y_train)
        predicted_values = model.predict(self.X_test)

        predicted_values_df = pd.DataFrame(predicted_values)
        predicted_values_df = predicted_values_df.rename(columns={0:'html_pred',1:'css_pred',2:'javascript_pred',3:'jquery_pred'})
        true_values_df = test_df[['html','css','javascript','jquery']]

        predicted_values_df.reset_index(drop=True,inplace=True)
        true_values_df.reset_index(drop=True,inplace=True)

        concatenated_df = pd.concat([predicted_values_df, true_values_df], axis=1)
        true_values_df = test_df[['html','css','javascript','jquery']]

        predicted_values_df.reset_index(drop=True,inplace=True)
        true_values_df.reset_index(drop=True,inplace=True)

        k_means_df = pd.concat([predicted_values_df, true_values_df], axis=1)

        k_means_score = calculate_f1(k_means_df)

        print('Accuracy: {}'.format(k_means_score))


    def k_means(self):
        n_clusters = 4
        clf = KMeans(n_clusters = n_clusters, random_state=42)
        clf.fit(self.X_train)
        y_labels_train = clf.labels_
        y_labels_test = clf.predict(self.X_test)

        self.X_train[300] = y_labels_train
        self.X_test[300] = y_labels_test
        return self

In [None]:
kmeans_clustering(X_train_vect_avg,X_test_vect_avg,train_labels,test_labels).k_means().classification()

