In [1]:
## load packages
from bs4 import BeautifulSoup
import requests
import re
import os
import numpy as np
import pandas as pd

# Import and process

In [243]:
class Data:
    """Parse an equipment list page into parallel per-entry lists.

    Call order used by this notebook: get_soup() -> get_tags() ->
    extract_info() -> (manual fixes, checked with find_error()) ->
    add_groups() -> flatten_lists().

    Parameters
    ----------
    url : str
        Address of the page when ``url_bool`` is True, otherwise the path of a
        local HTML file.
    url_bool : bool
        True to download ``url`` with requests, False to read it from disk.
    """

    def __init__(self, url, url_bool = True):
        ## URL if url_bool = True else file
        self.url = url
        self.url_bool = url_bool
        ## names of items
        self.list_name = []
        ## status of items
        self.list_status = []
        ## total number of items
        self.list_count = []
        ## index of item for status
        self.list_idx = []
        ## link to confirmation
        self.list_link = []
    
    def get_soup(self):
        """Load the page (download or local file) into ``self.soup``.

        Raises requests.HTTPError for an HTTP error status when downloading.
        """
        if self.url_bool:
            ## a timeout keeps a stalled connection from hanging the kernel
            page = requests.get(self.url, timeout=30)
            ## fail loudly instead of silently parsing an error page
            page.raise_for_status()
            self.soup = BeautifulSoup(page.content, 'html.parser')
        else:
            with open(self.url, encoding="utf8") as fp:
                self.soup = BeautifulSoup(fp, 'html.parser')
    
    def get_tags(self):
        """Collect the tags the later steps work on.

        Sets ``list_total`` (stripped text) and ``list_href`` (anchors with an
        href) for every <li> containing an <img class="thumbborder">, plus
        ``list_h3`` (all <h3> tags) and ``list_tags`` (name of every tag in
        document order).
        """
        ## needed for extract_info
        items = [li for li in self.soup.find_all('li')
                 if li.find('img', class_='thumbborder') is not None]
        self.list_total = [li.text.strip() for li in items]
        self.list_href = [li.find_all('a', href=True) for li in items]
        ## needed for add_groups
        self.list_h3 = self.soup.find_all('h3')
        self.list_tags = [tag.name for tag in self.soup.find_all(True)]
        
    def extract_info(self):
        """Parse every entry of ``list_total`` into the per-entry lists.

        An entry looks like ``'<count> <name>: (<numbers>, <status>) ...'``.
        Fills ``list_count``, ``list_name``, ``list_idx``, ``list_status`` and
        ``list_link``; the last three hold one inner list per entry.

        Raises ValueError when an entry does not start with a number.
        """
        ## start from empty lists so re-running this step does not duplicate entries
        self.list_name = []
        self.list_status = []
        self.list_count = []
        self.list_idx = []
        self.list_link = []

        for idx, entry in enumerate(self.list_total):
            ## split on the first ':' only, so a later ':' does not truncate the details
            head, _, tail = entry.partition(':')
            ## remove leading/trailing space, split on first ' '
            temp1 = head.strip().split(' ', 1)

            ## turn string at front into int
            try:
                total = int(temp1[0])
            except ValueError:
                ## some have an extra char, so keep all leading digits
                ## (taking only the first character would turn '12*' into 1)
                leading = re.match(r'\d+', temp1[0])
                if leading is None:
                    raise
                total = int(leading.group())
            self.list_count.append(total)

            ## add item name
            self.list_name.append(temp1[1])

            ## take data between '()'
            temp2 = re.findall(r'\(.*?\)', tail.strip())
            ## split on last space to the right, remove '(' and ')'
            ## NOTE(review): only the last word is kept as status, e.g.
            ## 'destroyed by Bayraktar TB2' yields 'TB2' - confirm this is intended
            temp2 = [i.replace('(', '').replace(')', '').rsplit(' ', 1) for i in temp2]
            
            count = []
            status = []
            link = []

            for i, val in enumerate(temp2):
                ## some numbers concatenated together so find all numbers
                num = [int(s) for s in re.findall(r'\s|,|[^,\s]+', val[0]) if s.isdigit()]
                count.extend(num)
                ## get data after last space to the right
                status.extend([val[1].strip()] * len(num))
                ## add link to match the number of occurrences
                try:
                    link.extend([self.list_href[idx][i]['href']] * len(num))
                except (IndexError, KeyError):
                    ## fewer anchors than '(...)' groups: report entry and group
                    ## position; the link list stays short and needs a manual fix
                    print()
                    print(idx)
                    print(i)

            self.list_idx.append(count)
            self.list_status.append(status)
            self.list_link.append(link)

    def flatten_lists(self):
        """Flatten the per-entry lists into one-element-per-item ``flat_*`` lists.

        Reads ``list_groups`` and ``list_countries``, so add_groups() must have
        been called first.
        """
        ## repeat each entry name once per parsed status of that entry
        names = [[self.list_name[i]] * len(j) for i, j in enumerate(self.list_status)]
        self.flat_name = [item for sublist in names for item in sublist]
        self.flat_status = [item for sublist in self.list_status for item in sublist]
        self.flat_index = [item for sublist in self.list_idx for item in sublist]
        self.flat_link = [item for sublist in self.list_link for item in sublist]
        self.flat_group = [item for sublist in self.list_groups for item in sublist]
        self.flat_country = [item for sublist in self.list_countries for item in sublist]
            
    def add_groups(self):
        """Assign a category heading and a country code to every entry.

        Headings are the non-empty <h3> tags. The two headings containing
        'Russia' and 'Ukraine' mark the country sections; every other heading
        is a category whose name is the text before its first '('. Categories
        located before the 'Ukraine' heading get 'RUS', later ones 'UKR'.

        Sets ``list_groups`` and ``list_countries``: one inner list per entry,
        repeated ``list_count`` times for that entry.
        """
        ## find relative indices where text != '\n'
        idx_h3 = [m for m, n in enumerate(self.list_h3) if n.text != '\n']

        ## absolute find index of all h3 tags
        idx_tag = [idx for idx, tag in enumerate(self.list_tags) if tag == 'h3']
        ## retain only absolute indices where text != '\n'
        idx_h3 = [idx_tag[i] for i in idx_h3]

        ## find index of li tags
        idx_li = [n for n, tag in enumerate(self.list_tags) if tag == 'li']

        ## text is only needed for the retained headings, keyed by absolute index
        all_tags = self.soup.find_all(True)
        text = {i: all_tags[i].text for i in idx_h3}

        ## find top-level tags where 'Russia' and 'Ukraine' occur
        idx_rus = [i for i in idx_h3 if 'Russia' in text[i]][0]
        idx_ukr = [i for i in idx_h3 if 'Ukraine' in text[i]][0]
        
        ## find all headings outside of those that contain 'Russia' and 'Ukraine'
        categories = [text[i] for i in idx_h3 if i not in [idx_rus, idx_ukr]]
        ## take info from heading before first '('
        name_cat = [entry.split('(')[0].strip() for entry in categories]
        
        list_indices = []
        list_country = []

        ## every heading but the last: collect the li tags up to the next heading
        for i in range(len(idx_h3) - 1):
            if idx_h3[i] not in [idx_rus, idx_ukr]:
                list_indices.append([idx for idx in idx_li if idx > idx_h3[i] and idx < idx_h3[i + 1]])
                if idx_h3[i] > idx_ukr:
                    list_country.append('UKR')
                else:
                    list_country.append('RUS')

        ## for last entry
        list_country.append('UKR')

        ## running totals of entries per category = slice boundaries in list_name
        idx_names = np.cumsum([len(i) for i in list_indices])

        group_names = []

        group_names.append(self.list_name[0:idx_names[0]])

        for i in range(len(idx_names) - 1):
            group_names.append(self.list_name[idx_names[i]:idx_names[i + 1]])

        ## for last entry: everything after the final boundary
        group_names.append(self.list_name[idx_names[-1]:])
        
        i = 0

        ## iterate over list count to get individual entries
        self.list_groups = []
        self.list_countries = []

        for idx, groups in enumerate(group_names):
            for group in groups:
                self.list_groups.append([name_cat[idx]] * self.list_count[i])
                self.list_countries.append([list_country[idx]] * self.list_count[i])
                i += 1

    def find_error(self):
        """Print and return the entries whose item numbers are inconsistent.

        An entry is inconsistent when its parsed numbers in ``list_idx`` are
        not exactly ``1..list_count`` in order.

        Returns
        -------
        list of int
            Positions of the inconsistent entries.
        """
        idx_wrong = [idx for idx, total in enumerate(self.list_count)
                     if list(range(1, total + 1)) != self.list_idx[idx]]

        ## position, raw text, parsed numbers, how many were parsed, declared total
        for i in idx_wrong:
            print(i)
            print(self.list_total[i])
            print(self.list_idx[i])
            print(len(self.list_idx[i]))
            print(self.list_count[i])
            print()

        return idx_wrong

In [3]:
## page to scrape (a live URL, so url_bool keeps its default of True)
url = 'https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html'
Oryx = Data(url)

## download and parse the page, collect the relevant tags, then parse every entry
Oryx.get_soup()
Oryx.get_tags()
Oryx.extract_info()

# Fix errors

In [18]:
## positions of entries whose parsed item numbers are not exactly 1..count
## (find_error also prints the details of each one)
idx_wrong = Oryx.find_error()

In [5]:
## fix errors: overwrite individual parsed item numbers by hand
## NOTE(review): the entry/position indices are hard-coded and presumably only
## valid for the page version scraped above - TODO confirm before re-running
Oryx.list_idx[83][4] = 5
Oryx.list_idx[91][-1] = 2
Oryx.list_idx[103][2] = 3
Oryx.list_idx[103][3] = 4

In [6]:
## drop repeated item numbers from one entry, keeping the first occurrence of
## each number together with its status and link
n = 138
temp_idx, temp_status, temp_link = [], [], []

for pos, number in enumerate(Oryx.list_idx[n]):
    if number in temp_idx:
        ## already kept once: report the repeated number and skip it
        print(number)
        continue
    temp_idx.append(number)
    temp_status.append(Oryx.list_status[n][pos])
    temp_link.append(Oryx.list_link[n][pos])

## write the de-duplicated lists back to the entry
Oryx.list_idx[n] = temp_idx
Oryx.list_status[n] = temp_status
Oryx.list_link[n] = temp_link

34
56


In [7]:
## fix errors caused by duplicate numbers: remove the listed positions from
## the number, status and link lists of one entry
n = 218

idx_wrong = [2, 4, 6, 8]

for attr in ('list_idx', 'list_status', 'list_link'):
    per_entry = getattr(Oryx, attr)
    per_entry[n] = [value for pos, value in enumerate(per_entry[n]) if pos not in idx_wrong]

In [9]:
## re-run the consistency check after the manual fixes; an empty list means
## every entry's item numbers are exactly 1..count
Oryx.find_error()

[]

# Add groups

In [24]:
## build Oryx.list_groups and Oryx.list_countries (read by flatten_lists)
Oryx.add_groups()

In [25]:
## flatten the per-entry lists into the one-element-per-item flat_* lists
Oryx.flatten_lists()

In [26]:
## the flattened lists become DataFrame columns, so their lengths must agree
for attr in ('flat_name', 'flat_status', 'flat_index', 'flat_group', 'flat_country', 'flat_link'):
    print(len(getattr(Oryx, attr)))

32
32
32
32
32
32


In [27]:
## one row per individual item; columns come straight from the flat_* lists
df = pd.DataFrame(dict(
    name=Oryx.flat_name,
    status=Oryx.flat_status,
    idx=Oryx.flat_index,
    group=Oryx.flat_group,
    country=Oryx.flat_country,
    link=Oryx.flat_link,
))

In [28]:
## number of rows for each (group, country) combination
df[['group', 'country']].value_counts()

group                        country
Trucks, Vehicles and Jeeps   UKR        12
Radars                       UKR         5
Infantry Fighting Vehicles   UKR         4
Armoured Fighting Vehicles   RUS         2
Infantry Mobility Vehicles   RUS         2
Aircraft                     UKR         1
Armoured Fighting Vehicles   UKR         1
Armoured Personnel Carriers  UKR         1
Helicopters                  RUS         1
Infantry Mobility Vehicles   UKR         1
Tanks                        RUS         1
Trucks, Vehicles and Jeeps   RUS         1
dtype: int64

In [29]:
## distinct group labels present in the data
df.group.unique()

array(['Tanks', 'Armoured Fighting Vehicles',
       'Infantry Mobility Vehicles', 'Helicopters',
       'Trucks, Vehicles and Jeeps', 'Infantry Fighting Vehicles',
       'Armoured Personnel Carriers', 'Radars', 'Aircraft'], dtype=object)

In [30]:
## save by datemod meta tag
## read the attribute directly: taking the first quoted string of the rendered
## tag only works when `content` happens to be the tag's first attribute
meta_datemod = Oryx.soup.find('meta', itemprop="dateModified", content=True)
if meta_datemod is None:
    raise ValueError('no <meta itemprop="dateModified"> tag with a content attribute found')
## drop ':' so the timestamp can be used in a file name
datemod = meta_datemod['content'].replace(':', '')
print(datemod)

2022-02-24T122808Z


In [31]:
## output dataframe to CSV, named after the page's modification timestamp
path = '../assets/{0}.csv'.format(datemod)
df.to_csv(path, index=False)

In [32]:
## show the timestamp that was used for the CSV file name
print(datemod)

2022-02-24T122808Z


# Process from archive

In [None]:
## read the snapshot file names, one per line, without surrounding whitespace
list_wayback = []
with open('../assets/wayback_machine/crawled/file_names.txt', encoding="utf8") as file:
    for line in file:
        list_wayback.append(line.strip())

## discard the final line of the listing
list_wayback = list_wayback[:-1]

In [37]:
## peek at the first snapshot file name
list_wayback[0]

'20220224123534.snapshot'

In [42]:
## NOTE(review): `file` is reused in this notebook - it is the open handle in
## the file_names.txt cell and the snapshot path inside the archive loop, so
## the value shown here depends on which of those cells ran last
file

'../assets/wayback_machine/crawled/20220316135717.snapshot'

In [298]:
## process every archived snapshot that has no CSV yet; typing 'stop' at either
## prompt leaves the loop with `Oryx` and `path` set for manual follow-up cells
for name in list_wayback:

    print(name)
    file = '../assets/wayback_machine/crawled/' + name
    Oryx = Data(file, url_bool=False)

    Oryx.get_soup()

    ## save by datemod meta tag
    ## read the attribute directly: taking the first quoted string of the
    ## rendered tag only works when `content` is the tag's first attribute
    meta_datemod = Oryx.soup.find('meta', itemprop="dateModified", content=True)
    if meta_datemod is None:
        raise ValueError('no <meta itemprop="dateModified"> tag with a content attribute found in ' + file)
    ## drop ':' so the timestamp can be used in a file name
    datemod = meta_datemod['content'].replace(':', '')
    print(datemod)

    path = '../assets/' + datemod + '.csv'

    ## several snapshots can share one modification time; export each only once
    if os.path.exists(path):
        print('{0} exists...'.format(path))
        continue

    Oryx.get_tags()
    Oryx.extract_info()
    print()
    idx_wrong = Oryx.find_error()
    print(idx_wrong)

    ## pause so the reported inconsistencies can be reviewed
    temp = input()
    if temp == 'stop':
        break

    Oryx.add_groups()
    Oryx.flatten_lists()

    ## the flattened lists become DataFrame columns, so their lengths must agree
    print(len(Oryx.flat_name))
    print(len(Oryx.flat_status))
    print(len(Oryx.flat_index))
    print(len(Oryx.flat_group))
    print(len(Oryx.flat_country))
    print(len(Oryx.flat_link))
    print()

    ## pause again before writing the file
    temp = input()
    if temp == 'stop':
        break

    df = pd.DataFrame({'name':Oryx.flat_name, 
                       'status':Oryx.flat_status, 
                       'idx': Oryx.flat_index, 
                       'group':Oryx.flat_group, 
                       'country':Oryx.flat_country, 
                       'link': Oryx.flat_link})

    df.to_csv(path, index=False)

20220224123534.snapshot
2022-02-24T122808Z
../assets/2022-02-24T122808Z.csv exists...
20220224144043.snapshot
2022-02-24T143801Z
../assets/2022-02-24T143801Z.csv exists...
20220224173139.snapshot
2022-02-24T170304Z
../assets/2022-02-24T170304Z.csv exists...
20220224191858.snapshot
2022-02-24T191637Z
../assets/2022-02-24T191637Z.csv exists...
20220224231142.snapshot
2022-02-24T225309Z
../assets/2022-02-24T225309Z.csv exists...
20220225005420.snapshot
2022-02-25T001319Z
../assets/2022-02-25T001319Z.csv exists...
20220225023717.snapshot
2022-02-25T001319Z
../assets/2022-02-25T001319Z.csv exists...
20220225035215.snapshot
2022-02-25T001319Z
../assets/2022-02-25T001319Z.csv exists...
20220225043350.snapshot
2022-02-25T001319Z
../assets/2022-02-25T001319Z.csv exists...
20220225083745.snapshot
2022-02-25T001319Z
../assets/2022-02-25T001319Z.csv exists...
20220225183832.snapshot
2022-02-25T175042Z
../assets/2022-02-25T175042Z.csv exists...
20220225195302.snapshot
2022-02-25T194828Z
../assets/2

In [299]:
## anchors found for entry 65, to compare against its parsed item numbers
Oryx.list_href[65]

[<a href="https://i.postimg.cc/8c0jfFn1/b6.png">(1, destroyed)</a>,
 <a href="https://i.postimg.cc/B65zvHbK/b6.png">(2 and 3, destroyed)</a>,
 <a href="https://i.postimg.cc/x16gjzR1/442.png">(4, destroyed)</a>,
 <a href="https://i.postimg.cc/1t68G6Ls/Screenshot-8400.png">(6, abandoned)</a>,
 <a href="https://i.postimg.cc/s2wR9DPH/768.png">(7, damaged and captured)</a>,
 <a href="https://i.postimg.cc/xCF0twG5/b6.png">(8, captured)</a>,
 <a href="https://i.postimg.cc/DZ0Q5pHc/6652.png">(9, 10 and 11, captured)</a>]

In [300]:
## the anchors of entry 65 shown above include none for item 5, so put a
## placeholder link at its position; not idempotent - run this cell only once
Oryx.list_link[65].insert(4, 'NA')

In [301]:
## repeat the last link two more times; presumably for items 10 and 11, which
## share the final anchor - TODO confirm. Not idempotent - run only once
Oryx.list_link[65].extend([Oryx.list_link[65][-1]] * 2)

In [302]:
## number of links now held for entry 65; presumably this should equal the
## number of parsed items of that entry - TODO confirm
len(Oryx.list_link[65])

11

In [303]:
## dry run for entry 97: collect the first occurrence of each item number (with
## its status and link) and print every repeated number
n = 97
temp_idx, temp_status, temp_link = [], [], []

for pos, number in enumerate(Oryx.list_idx[n]):
    if number in temp_idx:
        ## already kept once: report the repeated number and skip it
        print(number)
        continue
    temp_idx.append(number)
    temp_status.append(Oryx.list_status[n][pos])
    temp_link.append(Oryx.list_link[n][pos])

## the write-back is commented out, so Oryx is left unchanged by this cell
# Oryx.list_idx[n] = temp_idx
# Oryx.list_status[n] = temp_status
# Oryx.list_link[n] = temp_link

34
40


In [282]:
## manual override of the declared total for entry 97 (the find_error printout
## in this notebook shows the page text declaring 73 for it)
Oryx.list_count[97] = 72

In [278]:
## manual override of the declared total for entry 104
## NOTE(review): the reason is not recorded in the notebook - TODO document
Oryx.list_count[104] = 2

In [279]:
## manual override of the declared total for entry 134
## NOTE(review): the reason is not recorded in the notebook - TODO document
Oryx.list_count[134] = 1

In [285]:
## re-check consistency after the manual overrides; prints and returns the
## entries that are still inconsistent
Oryx.find_error()

97
73 KamAZ 6x6: (1, destroyed by Bayraktar TB2) (2, destroyed by Bayraktar TB2) (3 and 4, destroyed by Bayraktar TB2) (5, destroyed by Bayraktar TB2) (6, destroyed) (7, destroyed) (8, destroyed) (9, destroyed) (10, destroyed) (11, destroyed) (12, destroyed) (13, destroyed) (14, destroyed) (15, destroyed) (16, destroyed) (17, destroyed) (18, destroyed) (19, destroyed) (20, destroyed) (21, destroyed) (22, destroyed) (23 and 24, destroyed) (25, destroyed) (26, destroyed) (27, destroyed) (28, destroyed) (29, destroyed) (30, destroyed) (31, destroyed) (32, destroyed) (33, destroyed) (34, destroyed) (34, destroyed) (35, destroyed) (36, destroyed) (37, destroyed) (38, destroyed) (39, damaged) (40, damaged) (41, abandoned) (42, abandoned) (43, abandoned) (44, abandoned) (45, abandoned) (46, abandoned) (48, abandoned) (49, abandoned) (40 and 50, abandoned) (51, abandoned) (52, abandoned) (53, abandoned) (54, abandoned) (55, damaged and captured) (56, captured) (57, captured) (58, captured) (59

[97]

In [291]:
## item numbers between 1 and 72 that were not parsed for entry 97
sorted(set(range(1, 73)).difference(Oryx.list_idx[97]))

[47]

In [271]:
## finish the snapshot left over from the archive loop after the manual fixes;
## `path` is the CSV path that loop last assigned
Oryx.add_groups()
Oryx.flatten_lists()

## the flattened lists become DataFrame columns, so their lengths must agree
for attr in ('flat_name', 'flat_status', 'flat_index', 'flat_group', 'flat_country', 'flat_link'):
    print(len(getattr(Oryx, attr)))
print()

## typing 'stop' skips the export and just echoes the answer
temp = input()
if temp == 'stop':
    print(temp)
else:
    df = pd.DataFrame(dict(
        name=Oryx.flat_name,
        status=Oryx.flat_status,
        idx=Oryx.flat_index,
        group=Oryx.flat_group,
        country=Oryx.flat_country,
        link=Oryx.flat_link,
    ))

    df.to_csv(path, index=False)

713
713
713
713
713
713


