In [2]:
#connect to google drive
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/Colab Notebooks/andmeteadus/projekt")

Mounted at /content/drive


In [3]:
#imports
import pandas as pd
import regex as re
import math

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

##Read data from files and preprocess the data

In [4]:
#read datasets and merge train and test datasets together (into dataset called traintest)
uniqueTypes = pd.read_csv('unique_types.csv')
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sampleSubmission = pd.read_csv('sample_submission.csv')

train["dataset"] = "train"
test["dataset"] = "test"

traintest = pd.concat([train, test])
traintest['parameter_unit'] = traintest['parameter'].str.cat(traintest['unit'], sep = "_")

In [5]:
# Returns the dataframe, making 1-hot vectors of values in given column that appear at least the number of times given as the limit
def make1hotsOfColumnValues(df, column, limit):
    replaceable = []
    for el in df[column].unique():
        if df[df[column] == el].shape[0] < limit:
            replaceable.append(el)
    df[column] = df[column].replace(replaceable, 'PihlaJaHannaProjekt')
    newDf= pd.get_dummies(df[[column]], columns=[column]) #dataframe that contains the new columns only
    df=pd.concat([newDf, df], axis=1)
    df=df.drop([column], axis=1)#remove the original column
    return df

In [6]:
#removes short words from the given list and returns the list but with all words lowercased
def removeShortWordsAndLower(words):
  newWords = []
  for w in words:
    if len(w)>1:
      newWords.append(w.lower())
  return newWords


#returns list of most frequent words in one column. (these words can appear in text of column values)
def findFrequentWords(df, col, howMany):
  unnecessarySymbols="[ ,.;:()\[\]\"\'\\n]+"
  wordFreqs = dict()
  for index, row in df.iterrows():
    colValue = row[col]
    if(type(colValue)!=float or not math.isnan(row[col])):
      rowWords = re.split(unnecessarySymbols, row[col])
      rowWords = removeShortWordsAndLower(rowWords)
      for w in rowWords:
        if(w not in wordFreqs):
          wordFreqs[w] = 0
        wordFreqs[w] += 1
  
  freqValues = [k for k, v in sorted(wordFreqs.items(), key=lambda item: item[1], reverse = True)]
  return freqValues[:howMany]


#makes new columns for most frequent words in given column and fills the columns with 1 and 0 values
def makeBinaryColumnsOfFrequentWords(df, col, howMany):
  freqWords = findFrequentWords(df, col, howMany)

  for w in freqWords:
    wordInColumnBools = [] #booleans as integers
    for index, row in df.iterrows():
      isInColumn = False
      colValue = row[col]
      if(type(colValue)!=float or not math.isnan(row[col])):
        isInColumn = w in colValue.lower()
      wordInColumnBools.append(int(isInColumn))

    newCol = col+"_"+w
    df.insert(2, newCol, wordInColumnBools)



In [7]:
#make list of unique types
uniqueTypesList = []
for index, row in uniqueTypes.iterrows():
  uniqueTypesList.append(row["type"])

In [8]:
columns_to_1hot_with_limits = [('name', 8), ('material', 4), ('commentary', 5), ('event_type', 10), ('location', 4), ('start', 10), ('end', 8), ('country_and_unit', 10), ('before_Christ', 1), ('participants_role', 1), ('participant', 3), ('text', 3), ('class', 1), ('technique', 10), ('parameter_unit', 1), ('value', 2), ('museum_abbr', 10), ('musealia_mark', 1), ('musealia_additional_nr', 1), ('collection_mark', 11), ('collection_additional_nr', 1), ('legend', 22), ('initial_info', 7), ('damages', 4), ('state', 3), ('color', 1), ('additional_text', 11)]
columns_containing_useful_free_text = ['name', 'commentary', 'text', 'technique', 'legend', 'initial_info', 'additional_text']

columns_to_drop = ['full_nr', 'parish', 'element_count', 'ks', 'musealia_seria_nr', 'musealia_queue_nr', 'collection_queue_nr', 'is_original', 'parameter', 'unit']

for col in columns_to_drop:
    traintest = traintest.drop(col, axis=1)

In [9]:
howMany = 50

#make binary columns of frequent words in columns listed in columns_containing_useful_free_text
for i in range(len(columns_containing_useful_free_text)):
    print(columns_containing_useful_free_text[i]) # to keep track of how far we are
    makeBinaryColumnsOfFrequentWords(traintest, columns_containing_useful_free_text[i], howMany)

# estimated running times for this block with different howMany values
# howMany = 10 - 1 min
# howMany = 30 - 4 min
# howMany = 50 - 6 min
# howMany = 100 - 13 min

name
commentary
text
technique
legend
initial_info
additional_text


In [10]:
#make 1-hot vectors of frequent values in columns listed in columns_to_1hot_with_limits
#this block runs 1-2 minutes
for i in range(len(columns_to_1hot_with_limits)):
    traintest = make1hotsOfColumnValues(traintest, columns_to_1hot_with_limits[i][0], columns_to_1hot_with_limits[i][1])

In [11]:
traintest = traintest.fillna(0)

##Save preprocessed data to file or read it from file

In [None]:
filename = "preprocessedData.csv"
#uncomment as needed

#traintest.to_csv(filename, index=False) #saves data to file
#traintest = pd.read_csv(filename) #loads data from file

##Separating original trainset and testset

In [12]:
train = traintest[traintest["dataset"]=="train"]
test = traintest[traintest["dataset"]=="test"]

train = train.drop("dataset", axis=1)
test = test.drop("dataset", axis=1)

In [13]:
columnsX=list(train.columns)
columnsX.remove("type")

##Creating validation set and using it to find the best model


In [15]:
#separating trainset into training and validation data
x_train_val, x_val, y_train_val, y_val = train_test_split(train[columnsX], train[["type"]], random_state=1, train_size=0.75)

In [16]:
#in this block you can test different models and find their accuracy on the validation set
dtc =  DecisionTreeClassifier(random_state=8, max_depth=100, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0000001, max_features=None, criterion="entropy") #accuracy 0.904
rf = RandomForestClassifier(n_estimators=50, criterion="entropy", max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features="log2") #accuracy 0.909
#gnb = GaussianNB() #accuracy 0.3
#knn = KNeighborsClassifier(n_neighbors=20)# n = 5 and n = 3 accuracy 0.6,  n=20 accuracy 0.56
#lr = LogisticRegression() #accuracy 0.27

model = dtc
model.fit(x_train_val, y_train_val)
y_pred = model.predict(x_val)
acc = accuracy_score(y_val, model.predict(x_val))#soon võiks ka olla ypred
print(acc)

#rf 90.1 kui limit/howMany oli 50. 89 kui limit 10 ja 90.2 kui limit on 100. aga randoomsus on ka mängus. 30-ga 90.5, 89.7, 89.9
#dtc 89 vist kui limit/howmany 50 ja sama kui howmany on 10. 90.3 kui howmany on 100. 3+-ga 89.6, 90.0, 89.5


0.9031428571428571


##Training on the whole trainset, predicting the results on testset and saving the predictions

In [17]:
x_train=train[columnsX]
y_train=train["type"]

x_test=test[columnsX]

model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [18]:
# SAVING RESULTS TO FILE
predictions_df = pd.DataFrame(y_pred, columns = ["type"])
df_pred = test[["id"]]
#df_pred["type"] = predictions_df["type"] #rf
df_pred.insert(1, "type", y_pred) #dtc
df_pred.to_csv("predictions.csv", index=False)

## code to tune the hyperparameters for decision tree and random forest

In [20]:
#rf. 5 min 
ne = [10, 50] #n_estimators                  1
cr = ["gini", "entropy"] #criterion   2 
md = [None, 100, 10] #max_depth                3
mss = [2, 3] #min_samples_split                   4
msl = [1, 10] #min_samples_leaf                5
mf = ["log2", "sqrt"]#max_features          6

rfTuning = pd.DataFrame(columns=["ne", "cr","md", "mss","msl", "mf", "accuracy"])

bestModel=None
maxAcc=0.0
maxAccParams=[]
for i1 in ne:
  print(i1)
  for i2 in cr:
    print(i2)
    for i3 in md:
      for i4 in mss:
        for i5 in msl:
          for i6 in mf:
            model = RandomForestClassifier(random_state=5, n_estimators=i1, criterion=i2, max_depth=i3, min_samples_split=i4, min_samples_leaf=i5, max_features=i6)
            model = model.fit(x_train_val, y_train_val)
            acc = accuracy_score(y_val, model.predict(x_val))

            rfTuning = rfTuning.append({"md": i1, "mss": i2, "msl": i3, "mwfl": i4, "mf": i5, "c": i6, 'täpsus': acc }, ignore_index=True)
            if(acc>maxAcc):
              maxAcc=acc
              maxAccParams=[i1, i2, i3, i4, i5, i6]
              bestModel=model

p=maxAccParams
print("The best model was with parameters:  ")
print("n_estimators="+str(p[0])+", criterion="+str(p[1])+", max_depth="+str(p[2])+", min_samples_split="+str(p[3])+", min_samples_leaf="+str(p[4])+", max_features="+str(p[5]))
print("and accuracy: "+str(maxAcc))

rfTuning.to_csv("rf_mudelite_katsed_limit50.csv", index=False)

10
gini


  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y

entropy


  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y

50
gini


  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y

entropy


  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y_train_val)
  model = model.fit(x_train_val, y

The best model was with parameters:  
n_estimators=50, criterion=entropy, max_depth=None, min_samples_split=2, min_samples_leaf=1, max_features=log2
and accuracy: 0.9068571428571428


In [None]:
md = [2, 3, 4, 5, 10, 15]#max_depth       1
mss = [2, 3, 4, 6]#min_samples_split      2
msl = [1, 2, 3, 5, 7]#min_samples_leaf    3
mwfl = [0.0, 0.0000001, 0.01, 0.05, 0.1]#min_weight_fraction_leaf   4
mf = [1, 2, 3, 4, None, "sqrt"]#max_features    5
c = ["gini", "entropy"]#criterion   6


dtcTuning = pd.DataFrame(columns=["md", "mss","msl", "mwfl","mf", "c", "täpsus"])

bestModel=None
maxAcc=0.0
maxAccParams=[]
for i1 in md:
  for i2 in mss:
    for i3 in msl:
      for i4 in mwfl:
        for i5 in mf:
          for i6 in c:
            model = DecisionTreeClassifier(random_state=5, max_depth=i1, min_samples_split=i2, min_samples_leaf=i3, min_weight_fraction_leaf=i4, max_features=i5, criterion=i6)
            model = model.fit(x_train_val, y_train_val)
            acc = accuracy_score(y_val, model.predict(x_val))

            dtcTuning = dtcTuning.append({"md": i1, "mss": i2, "msl": i3, "mwfl": i4, "mf": i5, "c": i6, 'täpsus': acc }, ignore_index=True)
            if(acc>maxAcc):
              maxAcc=acc
              maxAccParams=[i1, i2, i3, i4, i5, i6]
              bestModel=model

p=maxAccParams
print("The best model was with parameters:  ")
print("max_depth="+str(p[0])+", min_samples_split="+str(p[1])+", min_samples_leaf="+str(p[2])+", min_weight_fraction_leaf="+str(p[3])+", max_features="+str(p[4])+", criterion="+str(p[5]))
print("and accuracy: "+str(maxAcc))

#since it had ran for over an hour, we decided to interrupt the cell and took the best model we had discovered at that time (from the dtcTuning dataframe). Turned out we had looked through about 90% of the models this code creates.