In [15]:
# imbalanced learning
import imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler, NearMiss

In [16]:
from sklearn import svm
from sklearn.datasets import make_classification
import json
import numpy as np

In [17]:
text = []
label = []
# 0 machine model1, 1 human
with open("../data/domain1_train.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        text.append(data["text"])
        label.append(data["label"])

In [18]:
# 2-8 machine 1 human
with open("../data/domain2_train.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        text.append(data["text"])
        if "model" in data.keys():
            label.append(data["model"]+2)
        else:
            label.append(1)

In [19]:
vector_sample = np.arange(5000)
from sklearn.feature_extraction.text import CountVectorizer

def toStr(n):
   return str(n)

# Create a Vectorizer Object
vectorizer = CountVectorizer(preprocessor= toStr, analyzer="word", token_pattern=r"(?u)\b\w+\b")

vectorizer.fit(vector_sample)

# Printing the identified Unique words along with their indices
print("Vocabulary: ", vectorizer.vocabulary_)

# Encode the Document
vector = vectorizer.transform(text)
# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.toarray())

Vocabulary:  {'0': 0, '1': 1, '2': 1112, '3': 2223, '4': 3334, '5': 4445, '6': 4556, '7': 4667, '8': 4778, '9': 4889, '10': 2, '11': 113, '12': 224, '13': 335, '14': 446, '15': 557, '16': 668, '17': 779, '18': 890, '19': 1001, '20': 1113, '21': 1224, '22': 1335, '23': 1446, '24': 1557, '25': 1668, '26': 1779, '27': 1890, '28': 2001, '29': 2112, '30': 2224, '31': 2335, '32': 2446, '33': 2557, '34': 2668, '35': 2779, '36': 2890, '37': 3001, '38': 3112, '39': 3223, '40': 3335, '41': 3446, '42': 3557, '43': 3668, '44': 3779, '45': 3890, '46': 4001, '47': 4112, '48': 4223, '49': 4334, '50': 4446, '51': 4457, '52': 4468, '53': 4479, '54': 4490, '55': 4501, '56': 4512, '57': 4523, '58': 4534, '59': 4545, '60': 4557, '61': 4568, '62': 4579, '63': 4590, '64': 4601, '65': 4612, '66': 4623, '67': 4634, '68': 4645, '69': 4656, '70': 4668, '71': 4679, '72': 4690, '73': 4701, '74': 4712, '75': 4723, '76': 4734, '77': 4745, '78': 4756, '79': 4767, '80': 4779, '81': 4790, '82': 4801, '83': 4812, '84':

In [20]:
X = vector.toarray()
y = np.array(label).ravel()

print("Shape of X:", X.shape)
print("Shape of y1", y.shape)

Shape of X: (34400, 5000)
Shape of y1 (34400,)


In [21]:
y

array([1, 1, 1, ..., 3, 6, 8])

In [22]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
ros = RandomOverSampler(random_state=None)
X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [24]:
smote = SMOTE(random_state=None)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [25]:
rus = RandomUnderSampler(random_state=None)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [26]:
nm = NearMiss(version = 3)
X_train_nm, y_train_nm = nm.fit_resample(X_train, y_train)



In [36]:
#model_plain = svm.SVC()
#model_plain.fit(X_train, y_train)

In [50]:
#model_rus = svm.SVC()
#model_rus.fit(X_train_rus, y_train_rus)

In [None]:
model_ros = svm.SVC()
model_ros.fit(X_train_ros, y_train_ros)

In [37]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import accuracy_score

In [38]:
# 'model_plain','model_ros','model_smote','model_rus','model_nm'
# model_plain,model_ros,model_smote,model_rus,model_nm
model_name = ['model_ros']
model = [model_ros]
    
for index in range(len(model_name)):
    print("processing model:",model_name[index])
    pred_result = model[index].predict(X_test)
        
    accuracy = accuracy_score(y_test, pred_result)
    recall = recall_score(y_test, pred_result, average='macro')
    precision = precision_score(y_test, pred_result, average='macro')
    print("accuracy = ", accuracy, "reall = ",recall, "precision = ",precision)

processing model: model_plain
accuracy =  0.673546511627907 reall =  0.480946856512382 precision =  0.4781571398793323


In [52]:
text = []

with open("../data/test_set.json") as f:
    for line in f:
        # read line by line
        data = json.loads(line)
        
        # add values
        text.append(data["text"])

In [53]:
# Encode the Document
vector = vectorizer.transform(text)

# Summarizing the Encoded Texts
print("Encoded Document is:")
print(vector.toarray())

X = vector.toarray()

print("Shape of X:", X.shape)

Encoded Document is:
[[32 14  3 ...  0  0  0]
 [ 2 11  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 7  1  0 ...  0  0  0]
 [27 18  6 ...  0  0  0]]
Shape of X: (1000, 5000)


In [57]:
import pandas as pd
y_result = model_ros.predict(X)
for i in range(len(y_result)):
    if y_result[i] == 1:
        y_result[i] = 1
    else:
        y_result[i] = 0
        
print(y_result)


[0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 1 1 1 0 0 0 1 0 1
 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 1 1 1 1 0 1 0 0 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 0 0 1 0 1 0 0 0 1 1 1 0
 0 0 0 0 0 1 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0 1 0 0 1 0 0 1 0 0
 1 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 0 1 0 1 1 0 0 1 0 0 1 0 0 0 0 0 0 1 1 0
 0 0 0 0 1 1 1 0 1 1 0 1 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1
 1 0 0 0 0 1 0 1 0 1 0 0 1 1 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 1 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0
 0 0 1 0 1 0 1 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 0 1 1 0 0 0 1 1 0 1 0 1
 0 1 1 0 0 0 1 1 1 0 0 1 0 1 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0
 1 0 0 1 0 0 0 0 0 0 0 0 0 1 1 0 1 1 1 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 1 1 0 0 0 1 1 1 1 0 1 0 0 0 0 0 1 0 0 0 0 1 1 1 0 0 0 0 0 0 1 0
 0 0 0 0 1 0 0 1 0 0 0 1 

In [58]:
y_1 = list(y_result)
y_1.count(1)

301

In [49]:

result = pd.DataFrame({"class":y_result}).reset_index().rename(columns = {'index':'id'})
result.to_csv("./svmdomain1and2_rus.csv", index=False)