In [47]:
import time
start_time = time.time()

import pandas as pd   # pandas for data frame organization
import numpy as np  # numpy for math stuff
from sklearn.linear_model import LogisticRegression  # Used Logistic Regression for prediction
from gensim.models.doc2vec import TaggedDocument
import nltk
import pickle
import random
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error

print("--- %s seconds ---" % (time.time() - start_time))


# Punctuation marks that prediction() replaces with spaces before tokenizing
# a title; expand if necessary.
punctuations = [",", ".", "(", ")", "-", "/"]


def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE(review): pickle.load can execute arbitrary code embedded in the
    # file; only load artifacts that were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# new_model (provides infer_vector), logreg (provides predict_proba) and
# sortedCompany (indexed by 'Company' / 'FG') are consumed by prediction().
new_model, logreg, sortedCompany = _load_pickle("training_parameters.pkl")

lda_model_10, id2word, d_topic_10, converted_currency = _load_pickle("stuff to run")
title_list = _load_pickle("title list")
stop_words = set(stopwords.words('english'))
nlp = _load_pickle("spacy.pkl")

def tokenize_text(text):
    """Split ``text`` into lower-cased word tokens.

    The text is broken into sentences with ``nltk.sent_tokenize`` and each
    sentence into words with ``nltk.word_tokenize``. Tokens shorter than two
    characters are dropped.

    Returns:
        list of str: the surviving tokens, lower-cased, in document order.
    """
    return [
        token.lower()
        for sentence in nltk.sent_tokenize(text)
        for token in nltk.word_tokenize(sentence)
        if len(token) >= 2
    ]



def prediction(title):
    """Rank the companies most likely to be involved in a project title.

    The title is lower-cased, every mark in ``punctuations`` is replaced by a
    space, and the tokens are embedded with ``new_model.infer_vector``.
    ``logreg.predict_proba`` then yields one probability per row of
    ``sortedCompany``.
    NOTE(review): this assumes the rows of ``sortedCompany`` are in the same
    order (and number) as the classifier's classes -- confirm.

    Only companies with probability >= 0.01 are considered; the five most
    probable are kept. The FG values and the resulting pairs are also printed.

    Args:
        title: project title as a string.

    Returns:
        list of (company, FG) tuples, at most five, both values as strings.
    """
    cleaned_title = title.lower()
    for mark in punctuations:
        # Splitting on the mark and re-joining with spaces swaps each
        # occurrence of the mark for a single space.
        cleaned_title = " ".join(cleaned_title.split(mark))

    title_vector = new_model.infer_vector(tokenize_text(cleaned_title), epochs=10000)

    # Predicted probability for each company to be involved in the new project.
    company_probabilities = logreg.predict_proba([title_vector])[0]

    candidates = pd.DataFrame({
        'Company': sortedCompany['Company'],
        'Probability': company_probabilities,
        "FG": sortedCompany['FG'],
    })

    # Keep only companies with at least a 1 % chance of competing.
    likely = candidates.loc[candidates['Probability'] >= 0.01]
    ranked = likely.sort_values(by=['Probability'], ascending=False)
    top_five = ranked.head(5).astype(str)

    fg_values = top_five['FG'].values
    company_names = list(top_five['Company'].values)
    print(fg_values)
    print(list(zip(company_names, fg_values)))
    return list(zip(company_names, fg_values))

def get_value(sentence):
    """Predict a value for ``sentence`` and report the regressor's test error.

    NOTE(review): ``clf_linear_reg``, ``get_topic``, ``x_test`` and ``y_test``
    are not defined in the visible part of this notebook; they must already
    exist in the kernel for this function to run.

    Args:
        sentence: text passed to ``get_topic`` to obtain the feature values.

    Returns:
        tuple: ``(predicted_value, mae)`` where ``predicted_value`` is the
        single prediction clamped at 0 and rounded to 2 decimals, and ``mae``
        is the mean absolute error of ``clf_linear_reg`` on
        ``x_test`` / ``y_test``, rounded to 2 decimals.

    Raises:
        ValueError: if the regressor returns more than one value for the
            single input row.
    """
    # The hold-out error does not depend on ``sentence``; it is recomputed on
    # every call, exactly as before.
    holdout_predictions = clf_linear_reg.predict(x_test)

    features = np.array(get_topic(sentence)).reshape(1, -1)
    raw_prediction = np.asarray(clf_linear_reg.predict(features))
    # .item() extracts the lone prediction whether predict() returns shape
    # (1,) or (1, 1); calling float() directly on an array with ndim > 0 is
    # deprecated since NumPy 1.25.
    predicted_value = round(max(float(raw_prediction.item()), 0), 2)

    holdout_mae = round(mean_absolute_error(y_test, holdout_predictions), 2)
    return (predicted_value, holdout_mae)

# Report the time elapsed since start_time was taken.
elapsed_seconds = time.time() - start_time
print("--- %s seconds ---" % elapsed_seconds)

--- 0.0 seconds ---
--- 0.4617645740509033 seconds ---


In [48]:
import json

# Smoke test: rank companies for one sample title and show the JSON form of
# the returned (company, FG) pairs.
sample_matches = prediction("cloud service")
json.dumps(sample_matches)

['3' '2' '2' '2' '6']
[('starhub ltd.', '3'), ('wizlearn technologies pte. ltd.', '2'), ('ace-learning systems pte. ltd.', '2'), ('innov8te pte. ltd.', '2'), ('jardine onesolution(2001) pte ltd', '6')]


'[["starhub ltd.", "wizlearn technologies pte. ltd.", "ace-learning systems pte. ltd.", "innov8te pte. ltd.", "jardine onesolution(2001) pte ltd"], ["3", "2", "2", "2", "6"]]'

In [38]:
# Scratch check: float() does not accept a list. The error is caught and
# displayed so that this cell no longer aborts a "Run All".
try:
    float([2, 3, 4])
except TypeError as err:
    print("TypeError:", err)

TypeError: float() argument must be a string or a number, not 'list'

In [17]:
import pymongo
myclient = pymongo.MongoClient("mongodb://localhost:27017/")  # Port
mydb = myclient["CompetitionIntellUpdated"]  # DB name
mycol = mydb["GeBizCollection"]  # Collection name

# Select only IT Service 408 entries out of 11k
myquery = {"ProcurementCategory": 'IT&Telecommunication ⇒ IT Services & Software Development '}  # Query
mydoc = mycol.find(myquery)
data = list(mydoc)

# Flatten single-award entries: copy each respondent's name / price and the
# award details onto top-level keys of the entry. Entries with zero or with
# several awards are left untouched.
for entry in data:
    if len(entry['Awards']) != 1:
        continue
    for i in range(len(entry['Respondents'])):
        respondent = entry['Respondents'][str(i)]
        entry['Respondent' + str(i)] = respondent['CompanyName']
        entry['_Respondent' + str(i) + "Value"] = respondent['TotalPrice']
    award = entry['Awards'][str(0)]
    entry['AwardedTo'] = award['AwardedTo']
    entry['AwardedValue'] = award['AwardedValue']

# Collect one [company, price] row per respondent across all entries.
Rdata = []
for entry in data:
    try:
        for i in range(len(entry['Respondents'])):
            respondent = entry['Respondents'][str(i)]
            Rdata.append([respondent['CompanyName'], respondent['TotalPrice']])
    except (KeyError, TypeError):
        # Best effort: an entry whose respondent data is missing or malformed
        # contributes only the rows collected before the problem.
        continue

# Naming the columns here (rather than renaming 0/1 afterwards) also gives an
# empty result the expected 'Company' / 'Price' columns.
Rdf = pd.DataFrame(Rdata, columns=['Company', 'Price'])
def lowerCase(x):
    """Return ``x`` stripped of surrounding whitespace and lower-cased.

    Non-string values are returned unchanged.
    """
    return x.strip().lower() if isinstance(x, str) else x

# Fixed conversion rates into SGD, keyed by lower-case currency code.
# Built once instead of on every call.
_SGD_EXCHANGE_RATES = {
    'sgd': 1, 'usd': 1.36032, 'eur': 1.52292,
    'myr': 0.32874, 'chf': 1.33804, 'gbp': 1.76,
    "aud": 0.95, "cnh": 0.2, "jpy": 0.013, "qar": 0.38,
    "bnd": 1, "aed": 0.37, "nzd": 0.9, "idr": 0.000095,
    "sek": 0.14, "cad": 1.02, "php": 0.026, "inr": 0.02,
}


def convertToSGD(string):
    """Convert a price string such as ``"12400.00 (sgd)"`` to an SGD amount.

    The text before the first "(" is parsed as the amount; the text between
    that "(" and the next ")" is the currency code, matched case-insensitively
    and ignoring surrounding whitespace.

    Args:
        string: the price text. Non-string values are not converted.

    Returns:
        float | None: the amount multiplied by the currency's rate, or
        ``None`` when ``string`` is not a str, has no "(currency)" part, or
        names a currency missing from the rate table. In the latter two cases
        the offending text is printed so it can be inspected.

    Raises:
        ValueError: if the amount part is not a valid number.
    """
    if not isinstance(string, str):
        return None

    parts = string.split("(")
    amount = float(parts[0])

    if len(parts) < 2:
        # No currency marker at all: report the raw text instead of raising
        # IndexError.
        print(string)
        return None

    # Take everything up to the closing parenthesis, so trailing text after
    # ")" (or a missing ")") does not corrupt the currency code.
    currency = parts[1].split(")")[0].strip().lower()

    rate = _SGD_EXCHANGE_RATES.get(currency)
    if rate is None:
        print(currency)
        return None
    return amount * rate
def setFG(x):
    """Map an amount to its FG value (2 through 10).

    Each FG value covers amounts up to and including its ceiling; anything
    above the largest ceiling (30,000,000) maps to 10.
    """
    # Inclusive upper bounds for FG 2, 3, ..., 9, in ascending order.
    ceilings = (100000, 250000, 500000, 1000000,
                3000000, 5000000, 10000000, 30000000)
    for grade, ceiling in enumerate(ceilings, start=2):
        if x <= ceiling:
            return grade
    return 10
# Normalise every cell (strip + lower-case strings), convert the price text
# to SGD, and drop rows whose price could not be converted.
# Column-wise Series.map replaces DataFrame.applymap, which newer pandas
# versions deprecate, and works on old versions too.
Rdf = Rdf.apply(lambda column: column.map(lowerCase))
Rdf['Price'] = Rdf['Price'].apply(convertToSGD)
Rdf = Rdf.dropna()

# Highest SGD price seen per company, and the FG value derived from it.
Company_data = Rdf.groupby('Company').max().reset_index()
Company_data['FG'] = Company_data['Price'].apply(setFG)
Company_data = Company_data.set_index('Company')

# sortedCompany is used here as a collection of company names. Once
# training_parameters.pkl has been overwritten with `result` (last cell of
# this notebook), it is reloaded as a DataFrame instead, so take its
# 'Company' column in that case to keep this cell re-runnable.
if isinstance(sortedCompany, pd.DataFrame):
    company_names = list(sortedCompany['Company'])
else:
    company_names = sortedCompany

# NOTE(review): the inner join drops every company without price data, so
# `result` can be shorter than sortedCompany -- confirm this still lines up
# with the classifier output that prediction() pairs it with.
test = pd.DataFrame(index=company_names)
result = pd.concat([test, Company_data], axis=1, join='inner').reset_index()
result = result.rename(columns={"index": "Company"})
result

Unnamed: 0,Company,Price,FG
0,1-net singapore pte ltd,12400.00,2
1,1arche pte. ltd.,361000.00,4
2,360solutions business consultancy llp,40000.00,2
3,3d networks singapore pte. ltd.,1672546.00,6
4,3g global pte. ltd.,8700.00,2
5,a'cross media pte. ltd.,8800.00,2
6,a-sonic logistics pte. ltd.,69360.00,2
7,a-speed infotech pte. ltd.,17852.40,2
8,aam geospatial pte. ltd.,38000.00,2
9,aaron wills & co. private limited,68400.00,2


In [19]:
# Display the company names of `result` as a plain Python list.
result['Company'].tolist()

['1-net singapore pte ltd',
 '1arche pte. ltd.',
 '360solutions business consultancy llp',
 '3d networks singapore pte. ltd.',
 '3g global pte. ltd.',
 "a'cross media pte. ltd.",
 'a-sonic logistics pte. ltd.',
 'a-speed infotech pte. ltd.',
 'aam geospatial pte. ltd.',
 'aaron wills & co. private limited',
 'abi-tech solution pte. ltd.',
 'accenture pte ltd',
 'acclivis technologies and solutions pte. ltd.',
 'accura infosys llp',
 'accuracy pte. ltd.',
 'ace-learning systems pte. ltd.',
 'acecom technologies pte ltd',
 'aceplp.com pte ltd',
 'acp computer training school pte. ltd.',
 'activate interactive pte ltd',
 'activeo singapore pte. ltd.',
 'aculearn pte ltd',
 'ad planet group pte ltd',
 'addest technovation pte ltd',
 'adelphi digital consulting group pte. ltd.',
 'ademco(far east) pte ltd',
 'admaterials technologies pte. ltd.',
 'adnovum singapore pte. ltd.',
 'advancedata network pte. ltd.',
 'aeterna solutions',
 'affle global pte. ltd.',
 'afizan pte ltd',
 'aftershock 

In [21]:
# Display the FG column of `result`.
result.loc[:, 'FG']

0       2
1       4
2       2
3       6
4       2
5       2
6       2
7       2
8       2
9       2
10      2
11     10
12      6
13      6
14      2
15      2
16      2
17      3
18      4
19      4
20      3
21      2
22      2
23      2
24      4
25      5
26      2
27      4
28      4
29      2
       ..
648     6
649     3
650     5
651     4
652     5
653     2
654     2
655     2
656     2
657     9
658     2
659     3
660     2
661     5
662     2
663     4
664     2
665     2
666     2
667     3
668     3
669     2
670     3
671     5
672     2
673     3
674     3
675     2
676     2
677     2
Name: FG, Length: 678, dtype: int64

In [23]:
# Persist the models together with the company/FG table.
# NOTE: this overwrites the same training_parameters.pkl that the first cell
# loads, and the third element is now the `result` DataFrame
# (Company / Price / FG).
# The context manager guarantees the file is flushed and closed.
with open("training_parameters.pkl", "wb") as handle:
    pickle.dump((new_model, logreg, result), handle)