In [10]:
import time, requests, json

from datetime import datetime
from bs4 import BeautifulSoup

from tqdm import tqdm, tqdm_notebook

import pandas as pd
import numpy as np


# Directory prefix for the final CSV written by generate_human_report_cies.
report_dir = "./report/"
# Directory prefix for the intermediate CSV written by generate_pre_report_cies.
log_dir = "./log/"

In [3]:
# Scrape Transfermarkt to get the names of all the players

def get_scrapped_names_tfmarkt(url, log=False, timeout=30):
    """Scrape the player names listed on a Transfermarkt club page.

    Parameters
    ----------
    url : str
        Address of the club page to download.
    log : bool, optional
        When True, print a progress message before the download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook forever.

    Returns
    -------
    list of str
        Text of the ``<div>`` found in every ``td.hauptlink`` cell of the
        table inside ``div#yw1``, skipping the cells that also carry the
        ``rechts`` class.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    ValueError
        If the page does not contain the expected ``div#yw1`` table.
    """
    if log:
        print("Scrapping Transfermarkt for players of target club")

    # A browser-like User-Agent is sent with the request, as in the original code.
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, "lxml")

    # Fail with an explicit message instead of an AttributeError on None when
    # the page layout is not the expected one.
    container = soup.find("div", {"id": "yw1"})
    soup_table = container.find("table") if container is not None else None
    if soup_table is None:
        raise ValueError("No squad table (div#yw1) found at {}".format(url))

    names = []
    for td in soup_table.find_all("td", {"class": "hauptlink"}):
        # 'rechts' cells are presumably the right-aligned market-value cells
        # -- TODO confirm against the page markup.
        if 'rechts' in td.attrs['class']:
            continue
        name_div = td.find("div")
        if name_div is None:
            # A hauptlink cell without a <div> carries no player name.
            continue
        names.append(name_div.text)

    return names

In [4]:
# Use the CIES hook to get the players' estimated values over the last 12 months

def _split_name_and_club(label):
    """Split a CIES search label shaped like ``"name (club)"`` into ``(name, club)``.

    A label without a well-formed ``(...)`` part is returned whole as the name,
    with an empty club, instead of being sliced at position -1.
    """
    opening = label.find("(")
    closing = label.find(")")
    if opening == -1 or closing < opening:
        return label.strip(), ""
    return label[:opening].rstrip(), label[opening + 1:closing]


def get_report_from_cies(names, match_club, delay=1, log=False, timeout=30):
    """Associate each player name with its id in the CIES database.

    Every name is sent to the CIES ``searchPlayers`` hook. When the full name
    gives no result, each word of the name is tried on its own, last word
    first. Among the candidates returned, the one whose club equals
    ``match_club`` is kept.

    Parameters
    ----------
    names : iterable of str
        Player names to look up.
    match_club : str
        Club name expected between parentheses in the CIES answer.
    delay : float, optional
        Seconds to sleep after each player and before each fallback request,
        to avoid overusing the CIES server.
    log : bool, optional
        When True, print a progress message before the lookups.
    timeout : float, optional
        Seconds to wait for each HTTP answer before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per name with the columns ``Nom``, ``json_clubs``,
        ``json_names``, ``json_ids`` and ``NoIssue``.

        * Club matched: the three ``json_*`` cells hold the single matching
          club, name and id, and ``NoIssue`` is True.
        * Candidates found but no club match: the ``json_*`` cells hold the
          full candidate lists and ``NoIssue`` is False.
        * Nothing found: ``json_clubs`` and ``json_names`` are None,
          ``json_ids`` holds the raw body of the last answer and ``NoIssue``
          is False.

        Rows with ``NoIssue`` False need a manual rework.
    """
    # Imported locally so the cell does not depend on an extra top-level import.
    from urllib.parse import quote

    hook = "http://prod.fo-pfpo.iad-informatique.com/ratings/web/services?method=searchPlayers&key={}"

    def search(key):
        # Percent-encode the key so that characters such as "&" or "#" in a
        # name cannot be read as URL syntax.
        raw = requests.get(hook.format(quote(key, safe="")), timeout=timeout).content
        return raw, json.loads(raw)

    tuples = []
    if log:
        print("Associating each player from {} with its id in CIES database".format(match_club))
    for name in tqdm_notebook(names):
        response, ids = search(name)

        # A list (instead of a dict) means the name was not found.
        if isinstance(ids, list):

            # Retry with each word of the name, starting from the last one.
            for split in reversed(name.split()):
                if split == name:
                    # Single-word name: this exact query was just made.
                    continue
                time.sleep(delay)
                response, ids = search(split)

                if not isinstance(ids, list):
                    break

            # Still not found: flag the row for human intervention.
            if isinstance(ids, list):
                tuples.append((name, None, None, response, False))
                time.sleep(delay)
                continue

        response_ids = list(ids.keys())

        response_clubs = []
        response_names = []

        # Split every "name (club)" label into the two parallel lists.
        for val in ids.values():
            player, club = _split_name_and_club(val)
            response_names.append(player)
            response_clubs.append(club)

        # If the expected club is found, keep only that candidate.
        if match_club in response_clubs:
            index = response_clubs.index(match_club)

            response_clubs = response_clubs[index]
            response_names = response_names[index]
            response_ids = response_ids[index]
            match_found = True

        # Otherwise keep every candidate and flag the row for rework.
        else:
            match_found = False

        tuples.append((name, response_clubs, response_names, response_ids, match_found))
        # Delay to avoid overusing the CIES server.
        time.sleep(delay)

    return pd.DataFrame(tuples, columns=["Nom", "json_clubs", "json_names", "json_ids", "NoIssue"])

In [5]:
def _shift_month_label(label, months):
    """Return the ``"MM/YYYY"`` label lying ``months`` months before ``label``.

    The year is decremented when the shift crosses January, so "02/2019"
    shifted by 4 gives "10/2018" rather than an invalid month number.
    """
    month_text, year_text = label.split("/")[:2]
    total = int(year_text) * 12 + int(month_text) - 1 - months
    year, month_index = divmod(total, 12)
    return "{:02d}/{}".format(month_index + 1, str(year).zfill(len(year_text)))


def get_all_team_values_cies(df_json, delay=1, period = 4, log=False, timeout=30):
    """Fetch, for every CIES id, the most recent value and the one ``period`` steps earlier.

    Parameters
    ----------
    df_json : pandas.DataFrame
        Frame with a ``json_ids`` column holding one CIES player id per row.
    delay : float, optional
        Seconds to sleep after each request, to avoid overusing the server.
    period : int, optional
        Number of data points to go back from the most recent value. The
        column label of the older value is the most recent label shifted back
        by the same number of months.
        NOTE(review): this assumes one data point per month -- confirm
        against the CIES widget.
    log : bool, optional
        When True, print a progress message before the downloads.
    timeout : float, optional
        Seconds to wait for each HTTP answer before giving up.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df_json`` with three extra columns: the older value, the
        most recent value (both labelled ``"MM/YYYY"``, taken from the first
        player that has labels) and ``HasData``, which is True only when both
        values are present. When no label could be read at all (empty frame or
        no usable page), only ``HasData`` is added.
    """
    hook = "http://prod.fo-pfpo.iad-informatique.com/widget/transfertValuesView/en/{}?P_change=P1"

    mr_value_array = []
    target_value_array = []
    has_data_array = []

    pd_mr_label = None
    pd_target_label = None

    if log:
        print("Getting all the CIES values for each player")
    for idx in tqdm_notebook(df_json['json_ids']):
        page = requests.get(hook.format(idx), timeout=timeout).content
        soup = BeautifulSoup(page, "lxml")

        most_recent_value = None
        target_value = None

        # A page without the graph (unknown id, layout change) counts as
        # "no data" for that player instead of aborting the whole run.
        div = soup.find("div", {"id" : "content"})
        raw_graph = div.get("graph-data") if div is not None else None
        if raw_graph:
            graph_data = json.loads(raw_graph)
            cies_values = graph_data["data"]
            # The last label is dropped so that the labels line up with the values.
            cies_labels = graph_data["labels"][:-1]

            if cies_values:
                most_recent_value = cies_values[-1]
            # Only look back when the series is long enough.
            if len(cies_values) > period:
                target_value = cies_values[-1 - period]

            # Column labels come from the first player that provides them.
            if pd_mr_label is None and cies_labels:
                pd_mr_label = cies_labels[-1]
                pd_target_label = _shift_month_label(pd_mr_label, period)

        mr_value_array.append(most_recent_value)
        target_value_array.append(target_value)
        has_data_array.append(target_value is not None and most_recent_value is not None)

        time.sleep(delay)

    if pd_mr_label is None:
        # Without labels there are no value columns to name.
        return df_json.assign(**{"HasData" : has_data_array})

    return df_json.assign(**{pd_target_label : np.array(target_value_array),
                pd_mr_label : np.array(mr_value_array),
                "HasData" : has_data_array})

In [6]:
def get_human_report(df_json):
    """Build the human-readable report from the enriched CIES frame.

    Keeps only the rows whose ``HasData`` flag is true and removes the
    technical columns (``json_clubs``, ``json_names``, ``json_ids``,
    ``NoIssue``, ``HasData``). The input frame is left untouched.
    """
    technical_columns = ["json_clubs", "json_names", "json_ids", "NoIssue", "HasData"]
    rows_with_data = df_json.loc[df_json["HasData"]]
    return rows_with_data.drop(columns=technical_columns)

def export_df_cies(df_json):
    """Placeholder: no export is implemented yet; the argument is ignored."""
    return None

In [6]:
# Scrape the player names from the Olympique Lyonnais page on Transfermarkt.
names = get_scrapped_names_tfmarkt(url="https://www.transfermarkt.fr/olympique-lyon/startseite/verein/1041")

# Look every name up in the CIES database, keeping the candidate whose club matches.
df_json = get_report_from_cies(names, match_club = "Olympique Lyonnais")

# Export the file as CSV (so that it can be reworked by hand if needed)
#df_json

# Fetch the CIES values for every row.
# NOTE(review): unlike generate_human_report_cies, the rows flagged
# NoIssue == False are not filtered out before this call.
df_copy = get_all_team_values_cies(df_json)

# Keep the rows flagged HasData and drop the technical columns, then display the result.
final_df = get_human_report(df_copy)
final_df

HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




Unnamed: 0,Nom,json_clubs,06/2018,10/2018
0,Anthony Lopes,Olympique Lyonnais,17.0,17.2
1,Mathieu Gorgelin,Olympique Lyonnais,1.1,1.3
2,Anthony Racioppi,Olympique Lyonnais,1.6,1.7
3,Marcelo,Olympique Lyonnais,9.2,10.7
5,Mapou Yanga-Mbiwa,Olympique Lyonnais,4.5,2.8
6,Oumar Solet,Olympique Lyonnais,1.8,3.0
7,Jérémy Morel,Olympique Lyonnais,2.5,2.6
8,Ferland Mendy,Olympique Lyonnais,11.3,21.0
9,Marçal,Olympique Lyonnais,2.8,2.5
10,Léo Dubois,Olympique Lyonnais,0.0,11.6


In [7]:
def generate_pre_report_cies(target_club, url_tfmarkt):
    """Run the first stage of the pipeline for one club.

    Scrapes the player names from ``url_tfmarkt``, associates them with CIES
    entries of ``target_club`` and writes the resulting frame, without its
    index, to ``log_dir + "test.csv"`` so it can be reworked by hand.

    Returns the frame produced by ``get_report_from_cies``.
    """
    squad = get_scrapped_names_tfmarkt(url=url_tfmarkt, log=True)
    pre_report = get_report_from_cies(squad, match_club=target_club, log=True)

    # Keep a CSV copy for logging and for a possible manual rework.
    csv_path = log_dir + "test.csv"
    pre_report.to_csv(csv_path, index=False)

    return pre_report

def generate_human_report_cies(df_json, target_club):
    """Run the second stage of the pipeline and save the final report.

    Only the rows of ``df_json`` flagged ``NoIssue`` are sent to
    ``get_all_team_values_cies``; the result is reduced by
    ``get_human_report`` and written to ``report_dir`` under the name
    ``<dd-mm-YYYY_HHMMSS>-<target_club>.csv``.

    Returns the frame produced by ``get_human_report``.
    """
    matched_rows = df_json[df_json["NoIssue"]]
    with_values = get_all_team_values_cies(matched_rows, log=True)
    report = get_human_report(with_values)

    # The timestamp keeps successive reports for the same club from overwriting each other.
    stamp = datetime.now().strftime("%d-%m-%Y_%H%M%S")
    filename = "{}-{}.csv".format(stamp, target_club)
    report.to_csv(report_dir + filename)

    return report



Scrapping Transfermarkt for players of target club
Associating each player from Olympique de Marseille with its id in CIES database


HBox(children=(IntProgress(value=0, max=29), HTML(value='')))

Duje ----- Duje Caleta-Car
out



In [68]:
# Scratch check: after `&=` with False the flag becomes False.
z = True
z &= False
z

False

In [64]:
d = [1,2,3]
# NOTE(review): `d` is a plain list, so `d == None` compares the whole list
# with None and yields a single False; it is not an element-wise test.
not(np.all(d == None))

True

In [38]:
import numpy as np
import pandas as pd


In [41]:
# Small 2x3 integer frame used to experiment with DataFrame.assign.
a =  pd.DataFrame(np.array([[4,5,5],[5,6,8]]))
a

Unnamed: 0,0,1,2
0,4,5,5
1,5,6,8


In [42]:
# NOTE(review): this call raises the TypeError recorded below. The second
# positional argument of np.array is the dtype, so "test2" is rejected as a
# data type. Keyword names passed through ** must also be plain strings.
a.assign(**{np.array("test", "test2") :  np.array([["lol", "lol"],["lal", "lal"]])})

TypeError: data type "test2" not understood

In [43]:
# Hard-coded sample data for prototyping: `a` holds two column labels and `b`
# holds two-item rows of values given as strings, with None for missing entries.
a = ['06/2018', '01/2018']
b = [['17.0', '20.5'], ['1.1', '0.9'], ['1.6', '0.9'], ['9.2', '7.5'], [None, None], ['4.5', '8.3'], ['1.8', '2.1'], ['2.5', '2.3'], ['11.3', '4.4'], ['2.8', '3.9'], ['0.0', '3.0'], ['4.8', '5.5'], ['12.0', '12.2'], ['41.8', '37.4'], ['45.4', '12.4'], ['36.8', '9.4'], ['2.4', '3.8'], ['4.8', '4.3'], [None, None], ['61.2', '60.7'], ['69.8', '42.3'], ['8.3', '3.8'], [None, None], ['49.7', '22.2'], ['15.5', '23.0'], [None, None], ['3.5', None]]

In [44]:
# Show the sample labels and rows as plain text.
print(a)
print(b)

['06/2018', '01/2018']
[['17.0', '20.5'], ['1.1', '0.9'], ['1.6', '0.9'], ['9.2', '7.5'], [None, None], ['4.5', '8.3'], ['1.8', '2.1'], ['2.5', '2.3'], ['11.3', '4.4'], ['2.8', '3.9'], ['0.0', '3.0'], ['4.8', '5.5'], ['12.0', '12.2'], ['41.8', '37.4'], ['45.4', '12.4'], ['36.8', '9.4'], ['2.4', '3.8'], ['4.8', '4.3'], [None, None], ['61.2', '60.7'], ['69.8', '42.3'], ['8.3', '3.8'], [None, None], ['49.7', '22.2'], ['15.5', '23.0'], [None, None], ['3.5', None]]


In [46]:
# Build a frame from the sample rows and attach a HasData flag, which is
# False for the rows that contain a None.
pd.DataFrame(data=b, columns=a).assign(**{"HasData": [True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, False, True, True, False, False]})



Unnamed: 0,06/2018,01/2018,HasData
0,17.0,20.5,True
1,1.1,0.9,True
2,1.6,0.9,True
3,9.2,7.5,True
4,,,False
5,4.5,8.3,True
6,1.8,2.1,True
7,2.5,2.3,True
8,11.3,4.4,True
9,2.8,3.9,True


In [57]:
# Scratch check: taking the max with 0 turns a negative number into 0.
max(-5,0)

0