# Description
This file can be used for inserting new match data: match_id, the name of the file containing tweets, the names of the teams, the date and time of the match start, first half end, second half start and match end, and the hashtags for both teams. The data is added to a CSV file - match_data.csv, which is used for scraping tweets and analysing them.

The second part of this file is used for inserting new match details - minutes of goals, cards received and other important actions - into the match_details.csv file.

In [1]:
# Run this cell just once! (or restart the kernel before running it again)
# Set the main directory as the working directory.
# The path given to os.chdir is relative, so every additional run of this
# cell moves the working directory one more level up.


import os

os.chdir('..')


In [2]:
import pandas as pd

from utils.fixed import *


In [3]:
def check_if_empty(data_dict):
    """Tell whether every value of ``data_dict`` has been filled in.

    Despite its name, the function returns True when NO value is an empty
    string and False as soon as one value equals ''.
    """
    return not any(entry == '' for entry in data_dict.values())

def check_id(data_dict, dataframe):
    """Return True when the record's match id is not yet in the table.

    The id is read from ``data_dict['match']`` and compared against the
    values of the 'match' column of ``dataframe``.
    """
    candidate_id = data_dict['match']
    known_ids = list(dataframe['match'])
    return candidate_id not in known_ids

def check_data_format(data_dict):
    """Validate the textual format of every field of a new match record.

    The fields are checked in a fixed order: match id, file name, both team
    names, the four timestamps and finally both hashtag lists. The first
    field that does not match its pattern gets a message printed and stops
    the validation.

    Args:
        data_dict: Mapping with the string values 'match', 'file_name',
            'team1', 'team2', 'match_start', 'first_part_end',
            'second_part_start', 'match_end' and the iterables of strings
            'hashtags_team1' and 'hashtags_team2'.

    Returns:
        True when every field matches its pattern, False otherwise.
    """
    # Raw strings keep regex escapes such as \d, \w and \s out of Python's
    # own string-escape processing (non-raw forms trigger warnings).
    date_pattern = re.compile(r"^\d{4}\-\d{2}\-\d{2} \d{2}:\d{2}:\d{2}$")
    # NOTE: this pattern has no trailing '$', so only the beginning of the
    # id is checked; kept as is so ids with a suffix remain valid.
    match_pattern = re.compile(r"^\w{3}\-\w{3}")
    file_name_pattern = re.compile(r"^[\w_]+\.csv$")
    team_pattern = re.compile(r"^[\w\s]+$")
    hashtags_pattern = re.compile(r"^#[a-zA-Z0-9]+$")

    # (key, pattern, message printed on failure) - order defines which
    # problem is reported first.
    single_value_checks = (
        ('match', match_pattern, "INCORRECT MATCH ID"),
        ('file_name', file_name_pattern, "INCORRECT FILE NAME"),
        ('team1', team_pattern, "INCORRECT TEAM1 NAME"),
        ('team2', team_pattern, "INCORRECT TEAM2 NAME"),
        ('match_start', date_pattern, "INCORRECT DATE - match_start"),
        ('first_part_end', date_pattern, "INCORRECT DATE - first_part_end"),
        ('second_part_start', date_pattern,
         "INCORRECT DATE - second_part_start"),
        # Previously reported as 'match_start', which pointed at the wrong
        # field.
        ('match_end', date_pattern, "INCORRECT DATE - match_end"),
    )
    for key, pattern, message in single_value_checks:
        if not pattern.match(data_dict[key]):
            print(message)
            return False

    # Each hashtag of a team has to be valid on its own.
    for key in ('hashtags_team1', 'hashtags_team2'):
        for hashtag in data_dict[key]:
            if not hashtags_pattern.match(hashtag):
                print("INCORRECT HASHTAGS - " + key)
                return False
    return True


## SHOW CURRENT MATCH_DATA
Show the data which is currently saved in the match_data.csv file. This data is loaded to MATCH_DATA global variable.

In [4]:
MATCH_DATA


Unnamed: 0,match,file_name,team1,team2,match_start,first_part_end,second_part_start,match_end,hashtags_team1,hashtags_team2
0,PSG-LIV,PSG_LIV.csv,PSG,Liverpool,2018-09-18 19:00,2018-09-18 19:46,2018-09-18 20:01,2018-09-18 20:55,"['#psg', '#parissaintgermain']","['#liverpoolfc', '#liverpool', '#lfc']"
1,MAN-NEW,MAN_NEW.csv,Manchester,Newcastle,2018-10-06 16:30,2018-10-06 17:17,2018-10-06 17:32,2018-10-06 18:17,"['#mufc', '#manutd', '#manchesterunited', '#re...","['#newcastle', '#newcastleunited', '#nufc']"
2,LIV-MAN,LIV_MAN.csv,Liverpool,Manchester City,2018-10-07 15:30,2018-10-07 16:17,2018-10-07 16:32,2018-10-07 17:20,"[#liverpoolfc,#liverpool,#lfc]","[#manchestercity,#mancity,#manchestercityfootb..."
3,VAL-BAR,VAL_BAR.csv,Valencia,Barcelona,2018-10-07 18:45,2018-10-07 19:32,2018-10-07 19:47,2018-10-07 20:35,"[#valenciacf, #yocreoenesteequipo, #valencia, ...","[#fcbarcelona, #barca, #igersbarca, #forcabarc..."
4,LEI-EVE,LEI_EVE.csv,Leicester,Everton,2018-10-06 14:00,2018-10-06 14:48,2018-10-06 15:03,2018-10-06 15:53,"[#leicester, #leicestercity, #lcfc, #foxes, #b...","[#everton, #efc, #coyb, #evertonfc]"
5,FUL-ARS,FUL_ARS.csv,Fulham,Arsenal,2018-10-07 11:00,2018-10-07 11:47,2018-10-07 12:02,2018-10-07 12:51,"[#fulham, #ffc, #coyw]","[#arsenal, #afc, #coyg, #gunners, #arsenalfc]"
6,SOU-CHE,SOU_CHE.csv,Southampton,Chelsea,2018-10-07 13:15,2018-10-07 14:02,2018-10-07 14:17,2018-10-07 15:06,"[#saintsfc, #southamptonfc, #wemarchon]","[#chelseafc, #chelsea, #cfc, #cfcfamily, #cfcf..."


## ADDING NEW MATCH TO MATCH_DATA
Fill the empty strings with proper data - following the rules.
- match_id - unique value for each match. Format: 'team1-team2' , example: 'TOT-LIV', 'MAN-NEW'. This value must be unique in the dataframe, so if you get an error, please provide a new id
- file name - csv file which contains tweets and other twitter data for each match. Example: TOT_LIV.csv
- team1 - the name of first team. Example: Tottenham, Liverpool
- team2 - the name of the second team.
- match_start - start time of a match. Format: 'yyyy-mm-dd hh:mm:ss' , example: '2018-10-06 17:32:00'
- first_part_end - end time of first half (with extra time). Format: 'yyyy-mm-dd hh:mm:ss' , example: '2018-10-06 17:32:00'
- second_part_start - start time of second half. Format: 'yyyy-mm-dd hh:mm:ss' , example: '2018-10-06 17:32:00'
- match_end - end time of a match (with extra time). Format: 'yyyy-mm-dd hh:mm:ss' , example: '2018-10-06 17:32:00'
- hashtags_team1 - hashtags defining first team, separated by comma. Format: '#hashtag1,#hashtag2' , example: '#psg,#parissaintgermain'
- hashtags_team2 - list of hashtags defining second team, separated by comma. Format: '#hashtag1,#hashtag2' , example: '#psg,#parissaintgermain'

In [5]:
# Identification of the match and of the CSV file with its tweets.
match_id = 'SOU-CHE'
file_name = 'SOU_CHE.csv'

# Names of both teams.
team1, team2 = 'Southampton', 'Chelsea'

# Match timeline in the 'yyyy-mm-dd hh:mm:ss' format.
match_start = '2018-10-07 13:15:00'
first_part_end = '2018-10-07 14:02:00'
second_part_start = '2018-10-07 14:17:00'
match_end = '2018-10-07 15:06:00'

# Hashtags of each team as one comma-separated string (no spaces).
hashtags_team1 = ','.join(['#saintsfc', '#southamptonfc', '#wemarchon'])
hashtags_team2 = ','.join([
    '#chelseafc',
    '#chelsea',
    '#cfc',
    '#cfcfamily',
    '#cfcfans',
    '#chelseafans',
    '#coyb',
    '#comeonyoublues',
    '#theblues',
    '#blueisthecolour',
])


In [6]:
# Collect the values entered above into one record; the comma-separated
# hashtag strings are split into lists of single hashtags.
new_data_dict = dict(
    match=match_id,
    file_name=file_name,
    team1=team1,
    team2=team2,
    match_start=match_start,
    first_part_end=first_part_end,
    second_part_start=second_part_start,
    match_end=match_end,
    hashtags_team1=hashtags_team1.split(','),
    hashtags_team2=hashtags_team2.split(','),
)


## ADD DATA TO match_data.csv FILE
The data is added after checking if:
- all parameters are added
- match_id is unique
- data are written in proper format

In [7]:
# Run all three checks up front (so every validation message is printed)
# and combine their plain-bool results with `and` instead of bitwise `&`.
id_is_unique = check_id(new_data_dict, MATCH_DATA)
all_values_filled = check_if_empty(new_data_dict)
format_is_valid = check_data_format(new_data_dict)

if id_is_unique and all_values_filled and format_is_valid:
    # One-row frame whose columns are the keys of the record.
    new_match_row = pd.DataFrame(list(new_data_dict.items())).set_index(0).T
    # DataFrame.append was removed in pandas 2.0; pd.concat is its
    # replacement and also works on older pandas versions.
    MATCH_DATA = pd.concat([MATCH_DATA, new_match_row], ignore_index=True)
    MATCH_DATA.to_csv(MATCH_DATA_PATH, sep=';', encoding='utf-8')
else:
    print("INCORRECT DATA!!!")


INCORRECT DATA!!!


### CHECK THE MATCH_DATA DATAFRAME AFTER INSERTING NEW ROW

In [8]:
MATCH_DATA


Unnamed: 0,match,file_name,team1,team2,match_start,first_part_end,second_part_start,match_end,hashtags_team1,hashtags_team2
0,PSG-LIV,PSG_LIV.csv,PSG,Liverpool,2018-09-18 19:00,2018-09-18 19:46,2018-09-18 20:01,2018-09-18 20:55,"['#psg', '#parissaintgermain']","['#liverpoolfc', '#liverpool', '#lfc']"
1,MAN-NEW,MAN_NEW.csv,Manchester,Newcastle,2018-10-06 16:30,2018-10-06 17:17,2018-10-06 17:32,2018-10-06 18:17,"['#mufc', '#manutd', '#manchesterunited', '#re...","['#newcastle', '#newcastleunited', '#nufc']"
2,LIV-MAN,LIV_MAN.csv,Liverpool,Manchester City,2018-10-07 15:30,2018-10-07 16:17,2018-10-07 16:32,2018-10-07 17:20,"[#liverpoolfc,#liverpool,#lfc]","[#manchestercity,#mancity,#manchestercityfootb..."
3,VAL-BAR,VAL_BAR.csv,Valencia,Barcelona,2018-10-07 18:45,2018-10-07 19:32,2018-10-07 19:47,2018-10-07 20:35,"[#valenciacf, #yocreoenesteequipo, #valencia, ...","[#fcbarcelona, #barca, #igersbarca, #forcabarc..."
4,LEI-EVE,LEI_EVE.csv,Leicester,Everton,2018-10-06 14:00,2018-10-06 14:48,2018-10-06 15:03,2018-10-06 15:53,"[#leicester, #leicestercity, #lcfc, #foxes, #b...","[#everton, #efc, #coyb, #evertonfc]"
5,FUL-ARS,FUL_ARS.csv,Fulham,Arsenal,2018-10-07 11:00,2018-10-07 11:47,2018-10-07 12:02,2018-10-07 12:51,"[#fulham, #ffc, #coyw]","[#arsenal, #afc, #coyg, #gunners, #arsenalfc]"
6,SOU-CHE,SOU_CHE.csv,Southampton,Chelsea,2018-10-07 13:15,2018-10-07 14:02,2018-10-07 14:17,2018-10-07 15:06,"[#saintsfc, #southamptonfc, #wemarchon]","[#chelseafc, #chelsea, #cfc, #cfcfamily, #cfcf..."


## ADD MATCH DETAILS TO match_details.csv
Match details contain information about the course of the match, such as the minutes of goals, important actions and yellow or red cards. These minutes are important for assessing the sentiment prediction method used for match analyses.

WARNING: The minutes added to match_details.csv need to include extra time in the first and second half.

WARNING2: The match_id has to be unique!

In [9]:
# Id of the match the details below belong to.
match_id = 'SOU-CHE'

# Minutes for the first team (an empty string means no such event).
goals_team1 = ''
action_team1 = '26, 82'
cards_team1 = '47, 56, 61, 64, 68, 92'

# Minutes for the second team.
goals_team2 = '30, 59, 95 '
action_team2 = '8'
cards_team2 = ''


In [15]:
# Collect the match details entered above into one record.
new_details_dict = dict(
    match=match_id,
    goals_team1=goals_team1,
    goals_team2=goals_team2,
    action_team1=action_team1,
    action_team2=action_team2,
    cards_team1=cards_team1,
    cards_team2=cards_team2,
)


In [13]:
match_detail_df

Unnamed: 0,match,goals_team1,goals_team2,action_team1,action_team2,cards_team1,cards_team2
0,PSG-LIV,"40, 84","30, 36, 93","17, 35","6, 8, 15, 59, 62, 73, 83",46,27
1,MAN-NEW,"73, 79, 93","7, 10","54, 55, 59, 79","35, 62","47, 66","61, 72"
2,LIV-MAN,,,"67, 70","14, 64, 78, 88",93,"21, 59, 67"
3,VAL-BAR,2,23,"6, 39",,"43, 60","41, 60"
4,LEI-EVE,40,"7, 79","34, 58, 95","9, 75, 87","31, 43, 65, 96",45
5,FUL-ARS,44,"29, 51, 69, 81, 93","3, 42, 75","33, 50","59, 72",
6,SOU-CHE,,"30, 59, 95","26, 82",8,"47, 56, 61, 64, 68, 92",


In [18]:
# One-row frame whose columns are the keys of the details record.
new_match_detail_row = pd.DataFrame(list(new_details_dict.items())).set_index(0).T
# Reload the stored details so the id check runs against the file content.
match_detail_df = pd.read_csv(MATCH_DETAILS_PATH, sep=';', encoding='utf-8', index_col=0)
if check_id(new_details_dict, match_detail_df):
    # DataFrame.append was removed in pandas 2.0; pd.concat is its
    # replacement and also works on older pandas versions.
    match_detail_df = pd.concat([match_detail_df, new_match_detail_row], ignore_index=True)
else:
    print("Match ID has to be unique! Change 'match_id' variable")


Match ID has to be unique! Change 'match_id' variable


### CHECK THE MATCH_DETAIL DATAFRAME AFTER INSERTING NEW ROW

In [14]:
match_detail_df

Unnamed: 0,match,goals_team1,goals_team2,action_team1,action_team2,cards_team1,cards_team2
0,PSG-LIV,"40, 84","30, 36, 93","17, 35","6, 8, 15, 59, 62, 73, 83",46,27
1,MAN-NEW,"73, 79, 93","7, 10","54, 55, 59, 79","35, 62","47, 66","61, 72"
2,LIV-MAN,,,"67, 70","14, 64, 78, 88",93,"21, 59, 67"
3,VAL-BAR,2,23,"6, 39",,"43, 60","41, 60"
4,LEI-EVE,40,"7, 79","34, 58, 95","9, 75, 87","31, 43, 65, 96",45
5,FUL-ARS,44,"29, 51, 69, 81, 93","3, 42, 75","33, 50","59, 72",
6,SOU-CHE,,"30, 59, 95","26, 82",8,"47, 56, 61, 64, 68, 92",


In [12]:
# Write the match details table back to its ';'-separated CSV file.
match_detail_df.to_csv(
    MATCH_DETAILS_PATH,
    sep=';',
    encoding='utf-8',
)
