In [1]:
## load packages
from bs4 import BeautifulSoup
import requests
import re
import os
import numpy as np
import pandas as pd

# Import and process

In [2]:
class Data:
    """Scrape a list-style web page and parse its entries into parallel lists.

    Typical call order: get_soup() -> get_tags() -> extract_info() ->
    (find_error() and manual fixes) -> add_groups() -> flatten_lists().

    Each kept <li> is expected to look like
    ``"<count> <name>: (<numbers>, <status>) (<numbers>, <status>) ..."``.
    Only the last space-separated word inside each pair of parentheses is
    stored as the status.
    """

    def __init__(self, url):
        """Store the page URL and create the empty per-entry result lists."""
        self.url = url
        ## names of items
        self.list_name = []
        ## status of items
        self.list_status = []
        ## total number of items
        self.list_count = []
        ## index of item for status
        self.list_idx = []
        ## link to confirmation
        self.list_link = []

    def get_soup(self, timeout=30):
        """Download self.url and parse it into self.soup.

        timeout: seconds passed to requests.get so a stalled connection
            cannot hang the notebook.

        Raises requests.HTTPError for a 4xx/5xx response instead of silently
        parsing an error page.
        """
        page = requests.get(self.url, timeout=timeout)
        page.raise_for_status()
        self.soup = BeautifulSoup(page.content, 'html.parser')

    def get_tags(self):
        """Collect the tags the later steps need from self.soup."""
        ## needed for extract_info: only <li> tags holding an <img class="thumbborder">
        items = [li for li in self.soup.find_all('li')
                 if li.find('img', class_='thumbborder') is not None]
        self.list_total = [li.text.strip() for li in items]
        self.list_href = [li.find_all('a', href=True) for li in items]
        ## needed for add_groups
        self.list_h3 = self.soup.find_all('h3')
        self.list_tags = [tag.name for tag in self.soup.find_all(True)]

    def extract_info(self):
        """Parse every entry of self.list_total.

        Appends one element per entry to list_count and list_name, and one
        sub-list per entry to list_idx, list_status and list_link.

        Raises ValueError when an entry does not start with a number.
        """
        for idx, entry in enumerate(self.list_total):
            ## take left of ':', remove leading/trailing space, split on first ' '
            temp1 = entry.split(':')[0].strip().split(' ', 1)

            ## turn string at front into int
            try:
                self.list_count.append(int(temp1[0]))
            ## some have extra char, so keep the whole leading run of digits
            ## (taking only the first character would turn '12?' into 1)
            except ValueError:
                leading = re.match(r'\d+', temp1[0])
                if leading is None:
                    raise ValueError(
                        'cannot parse item count from entry {}: {!r}'.format(idx, entry))
                self.list_count.append(int(leading.group()))

            ## add item name
            self.list_name.append(temp1[1])

            ## take right of ':'
            temp2 = entry.split(':')[1].strip()
            ## take data between '()'
            temp2 = re.findall(r'\(.*?\)', temp2)
            ## split on last space to the right, remove '(' and ')'
            temp2 = [i.replace('(', '').replace(')', '').rsplit(' ', 1) for i in temp2]

            count = []
            status = []
            link = []

            for i, val in enumerate(temp2):
                ## some numbers concatenated together so find all numbers
                num = [int(s) for s in re.findall(r'\s|,|[^,\s]+', val[0]) if s.isdigit()]
                count.extend(num)
                ## get data after last space to the right
                status.extend([val[1].strip()] * len(num))
                ## add link to match the number of occurrences
                ## (the i-th anchor of the entry is paired with the i-th group)
                link.extend([self.list_href[idx][i]['href']] * len(num))

            self.list_idx.append(count)
            self.list_status.append(status)
            self.list_link.append(link)

    def flatten_lists(self):
        """Flatten the per-entry sub-lists into flat_* lists.

        Each name is repeated once per status of its entry. Requires
        add_groups() to have created list_groups and list_countries.
        """
        names = [[self.list_name[i]] * len(j) for i, j in enumerate(self.list_status)]
        self.flat_name = [item for sublist in names for item in sublist]
        self.flat_status = [item for sublist in self.list_status for item in sublist]
        self.flat_index = [item for sublist in self.list_idx for item in sublist]
        self.flat_link = [item for sublist in self.list_link for item in sublist]
        self.flat_group = [item for sublist in self.list_groups for item in sublist]
        self.flat_country = [item for sublist in self.list_countries for item in sublist]

    def add_groups(self):
        """Build list_groups and list_countries from the <h3> headings.

        For every item, the text of the heading it falls under (up to the
        first '(') and a country code ('RUS' or 'UKR') are repeated
        list_count[item] times. Headings located after the first heading
        containing 'Ukraine' are labelled 'UKR', the others 'RUS'.
        """
        ## absolute positions (within list_tags) of every h3 tag
        idx_tag = [idx for idx, tag in enumerate(self.list_tags) if tag == 'h3']
        ## retain only the h3 tags where text != '\n'
        idx_h3 = [idx_tag[m] for m, n in enumerate(self.list_h3) if n.text != '\n']

        ## find index of li tags
        idx_li = [n for n, tag in enumerate(self.list_tags) if tag == 'li']

        ## heading text by absolute tag position; only the headings are
        ## looked up, so .text is not computed for every tag on the page
        all_tags = self.soup.find_all(True)
        text = {i: all_tags[i].text for i in idx_h3}

        ## find top-level tags where 'Russia' and 'Ukraine' occur
        idx_rus = [i for i in idx_h3 if 'Russia' in text[i]][0]
        idx_ukr = [i for i in idx_h3 if 'Ukraine' in text[i]][0]

        ## find all headings outside of those that contain 'Russia' and 'Ukraine'
        categories = [text[i] for i in idx_h3 if i not in [idx_rus, idx_ukr]]
        ## take info from heading before first '('
        name_cat = [entry.split('(')[0].strip() for entry in categories]

        list_indices = []
        list_country = []

        ## the last heading has no following heading, so it is handled after the loop
        for i in range(len(idx_h3) - 1):
            if idx_h3[i] not in [idx_rus, idx_ukr]:
                list_indices.append([idx for idx in idx_li if idx > idx_h3[i] and idx < idx_h3[i + 1]])
                if idx_h3[i] > idx_ukr:
                    list_country.append('UKR')
                else:
                    list_country.append('RUS')

        ## for last entry
        list_country.append('UKR')

        ## running totals of <li> tags per heading = slice boundaries into list_name
        idx_names = np.cumsum([len(i) for i in list_indices])

        group_names = []

        group_names.append(self.list_name[0:idx_names[0]])

        for i in range(len(idx_names) - 1):
            group_names.append(self.list_name[idx_names[i]:idx_names[i + 1]])

        ## for last entry
        last_entry = self.list_name[idx_names[-1]:-1]
        last_entry.append(self.list_name[-1])
        group_names.append(last_entry)

        i = 0

        ## iterate over list count to get individual entries
        self.list_groups = []
        self.list_countries = []

        for idx, groups in enumerate(group_names):
            for _ in groups:
                self.list_groups.append([name_cat[idx]] * self.list_count[i])
                self.list_countries.append([list_country[idx]] * self.list_count[i])
                i += 1

    def find_error(self):
        """Print and return the positions of entries whose parsed index
        numbers are not exactly 1, 2, ..., list_count[position]."""
        idx_wrong = []
        for idx, total in enumerate(self.list_count):
            if list(range(1, total + 1)) != self.list_idx[idx]:
                idx_wrong.append(idx)

        for i in idx_wrong:
            print(i)
            print(self.list_total[i])
            print(self.list_idx[i])
            print(len(self.list_idx[i]))
            print(self.list_count[i])
            print()

        return idx_wrong

In [3]:
## page to scrape
url = 'https://www.oryxspioenkop.com/2022/02/attack-on-europe-documenting-equipment.html'
Oryx = Data(url)

## download the page, collect the tags of interest, then parse every list entry
Oryx.get_soup()
Oryx.get_tags()
Oryx.extract_info()

# Fix errors

In [4]:
## positions of entries whose parsed index numbers are not exactly 1..count (details are printed)
idx_wrong = Oryx.find_error()

83
6 152mm 2A65 Msta-B howitzer: (1, destroyed by Bayraktar TB2) (2, destroyed by Bayraktar TB2) (3, destroyed by Bayraktar TB2) (4, damaged by Bayraktar TB2) (6, damaged by Bayraktar TB2) (6, abandoned)
[1, 2, 3, 4, 6, 6]
6
6

91
2 220mm BM-27 'Uragan': (1, destroyed by Bayraktar TB2) (1, abandoned and destroyed)
[1, 1]
2
2

103
4 9A330 Tor TLAR (for 9K330 Tor-M): (1, abandoned) (2, damaged and abandoned) (2, captured) (3, captured)
[1, 2, 2, 3]
4
4

138
165 KamAZ 6x6: (1, destroyed by Bayraktar TB2) (2, destroyed by Bayraktar TB2) (3 and 4, destroyed by Bayraktar TB2) (5, destroyed by Bayraktar TB2) (6, destroyed) (7, destroyed) (8, destroyed) (9, destroyed) (10, destroyed) (11, destroyed) (12, destroyed) (13, destroyed) (14, destroyed) (15, destroyed) (16, destroyed) (17, destroyed) (18, destroyed) (19, destroyed) (20, destroyed) (21, destroyed) (22, destroyed) (23 and 24, destroyed) (25, destroyed) (26, destroyed) (27, destroyed) (28, destroyed) (29, destroyed) (30, destroyed) (31,

In [5]:
## fix errors
## NOTE(review): positions are hard-coded from the find_error() output of this run —
## re-check that output before reusing these lines on a newer version of the page
## entry 83 was [1, 2, 3, 4, 6, 6]: the fifth number should be 5
Oryx.list_idx[83][4] = 5
## entry 91 was [1, 1]: the last number should be 2
Oryx.list_idx[91][-1] = 2
## entry 103 was [1, 2, 2, 3]: the last two numbers should be 3 and 4
Oryx.list_idx[103][2] = 3
Oryx.list_idx[103][3] = 4

In [6]:
## remove 2 duplicates in entries: keep the first occurrence of every index number
n = 138
temp_idx, temp_status, temp_link = [], [], []

for i in range(len(Oryx.list_idx[n])):
    j = Oryx.list_idx[n][i]
    if j in temp_idx:
        ## already seen: report the number and skip its status and link
        print(j)
        continue
    temp_idx.append(j)
    temp_status.append(Oryx.list_status[n][i])
    temp_link.append(Oryx.list_link[n][i])

Oryx.list_idx[n], Oryx.list_status[n], Oryx.list_link[n] = temp_idx, temp_status, temp_link

34
56


In [7]:
## fix errors caused by duplicate numbers: drop the listed positions from entry 218
n = 218

idx_wrong = [2, 4, 6, 8]

## the three lists run in parallel, so the same positions are removed from each
for attr in ('list_idx', 'list_status', 'list_link'):
    per_entry = getattr(Oryx, attr)
    per_entry[n] = [value for pos, value in enumerate(per_entry[n]) if pos not in idx_wrong]

In [9]:
## re-run the check; an empty list means every entry is now numbered 1..count
Oryx.find_error()

[]

# Add groups

In [10]:
## build the per-item heading (group) and country lists
Oryx.add_groups()

In [11]:
## flatten the per-entry lists into flat_* lists (reads the lists created by add_groups)
Oryx.flatten_lists()

In [12]:
## print the length of each flattened list (they become the dataframe columns)
for flat_attr in ('flat_name', 'flat_status', 'flat_index',
                  'flat_group', 'flat_country', 'flat_link'):
    print(len(getattr(Oryx, flat_attr)))

1890
1890
1890
1890
1890
1890


In [13]:
## one row per flattened record, one column per flat_* list
columns = {
    'name': Oryx.flat_name,
    'status': Oryx.flat_status,
    'idx': Oryx.flat_index,
    'group': Oryx.flat_group,
    'country': Oryx.flat_country,
    'link': Oryx.flat_link,
}
df = pd.DataFrame(columns)

In [14]:
## number of rows per (group, country) combination
df[['group', 'country']].value_counts()

group                              country
Trucks, Vehicles and Jeeps         RUS        425
Tanks                              RUS        232
Infantry Fighting Vehicles         RUS        207
Armoured Fighting Vehicles         RUS        143
Trucks, Vehicles and Jeeps         UKR         87
Armoured Personnel Carriers        RUS         68
Tanks                              UKR         66
Anti-Tank Guided Missiles          RUS         61
Anti-tank Guided Missiles          UKR         50
Armoured Fighting Vehicles         UKR         49
Engineering Vehicles               RUS         43
Infantry Fighting Vehicles         UKR         42
Infantry Mobility Vehicles         RUS         40
Helicopters                        RUS         32
Self-Propelled Artillery           RUS         32
Surface-To-Air Missile Systems     RUS         32
Towed Artillery                    RUS         31
Multiple Rocket Launchers          RUS         25
Infantry Mobility Vehicles         UKR         24
Man-Por

In [15]:
## distinct group labels
## NOTE(review): 'Anti-Tank Guided Missiles' and 'Anti-tank Guided Missiles' differ only
## in letter case and therefore show up as two separate groups
df.group.unique()

array(['Tanks', 'Armoured Fighting Vehicles',
       'Infantry Fighting Vehicles', 'Armoured Personnel Carriers',
       'Mine-Resistant Ambush Protected', 'Infantry Mobility Vehicles',
       'Communications Stations', 'Engineering Vehicles',
       'Anti-Tank Guided Missiles', 'Man-Portable Air Defence Systems',
       'Heavy Mortars', 'Towed Artillery', 'Self-Propelled Artillery',
       'Multiple Rocket Launchers', 'Anti-Aircraft Guns',
       'Self-Propelled Anti-Aircraft Guns',
       'Surface-To-Air Missile Systems', 'Radars',
       'Jammers And Deception Systems', 'Aircraft',
       'Unmanned Aerial Vehicles', 'Helicopters', 'Logistics Trains',
       'Trucks, Vehicles and Jeeps', 'Anti-tank Guided Missiles',
       'Naval Ships'], dtype=object)

In [16]:
## save by datemod meta tag
## read the 'content' attribute directly: scraping the first quoted string out of
## str(tag) only works when 'content' happens to be the first attribute in the HTML
meta_datemod = Oryx.soup.find('meta', itemprop="dateModified", content=True)
if meta_datemod is None:
    raise ValueError('no <meta itemprop="dateModified"> tag with a content attribute found')
## drop ':' from the timestamp (it is used in the output file name)
datemod = meta_datemod['content'].replace(':', '')
print(datemod)

2022-03-16T193938Z


In [17]:
## output dataframe to CSV
# cwd = os.getcwd()
## create the output folder first: to_csv does not create missing directories
os.makedirs('assets', exist_ok=True)
path = 'assets/' + datemod + '.csv'
df.to_csv(path, index=False)

In [18]:
## timestamp used in the CSV file name
print(datemod)

2022-03-16T193938Z


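# NOT IN USE (scratch checks: the cells below read `categories` and `group_names`, which no cell in this notebook defines — they exist only as local variables inside `Data.add_groups`)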

In [30]:
## parse the first number inside the parentheses of every category heading
## NOTE(review): `categories` is not assigned by any notebook cell shown here
num = []

for idx, cat in enumerate(categories):
    try:
        ## text from the first '(' up to the first ',' with both delimiters removed
        num.append(int(re.findall(r'\(.*?\,', cat)[0].replace('(', '').replace(',', '')))
    ## IndexError: no '(...,' found; ValueError: the captured text is not a plain integer
    except (IndexError, ValueError):
        #print(idx)
        ## custom designed to fix this error - print idx if more arise
        ## fall back to the text between the last '(' and the next ','
        num.append(int(cat.split('(')[-1].split(',')[0]))

In [31]:
## number of parsed records per entry
temp = list(map(len, Oryx.list_status))

## add up the record counts of the entries belonging to each group, in order
test = []
idx = 0

for i in group_names:
    end = idx + len(i)
    test.append(sum(temp[idx:end]))
    idx = end

In [32]:
## total of the numbers parsed from the category headings
sum(num)

1878

In [33]:
## number of parsed entries (list_link holds one sub-list per entry)
len(Oryx.list_link)

231

In [34]:
## total of the counts written at the front of each entry
sum(Oryx.list_count)

1879

In [35]:
## number of parsed records in the first group
sum(temp[0:len(group_names[0])])

229

In [114]:
## parsed records per group
test

[229,
 140,
 204,
 68,
 11,
 40,
 9,
 41,
 61,
 24,
 5,
 31,
 32,
 25,
 2,
 9,
 32,
 1,
 2,
 13,
 8,
 32,
 2,
 418,
 65,
 49,
 42,
 22,
 24,
 1,
 50,
 16,
 15,
 10,
 3,
 1,
 1,
 11,
 5,
 9,
 6,
 1,
 17,
 87]

In [36]:
## groups whose parsed record total differs from the number in their heading
idx_wrong = [i for i, j in enumerate(test) if j != num[i]]

In [37]:
## positions of the mismatching groups
idx_wrong

[23, 25, 43]

In [38]:
## item names belonging to the mismatching groups
[group_names[i] for i in idx_wrong]

[['GAZ-66',
  'ZiL-131',
  "transporter-loader (for BM-27 'Uragan' MRL)",
  '9T217 transloader (for 9K33 Osa)',
  'GAZ-3308',
  'GAZ Sobol',
  'Ural-4320',
  'Ural-43206',
  'Ural Federal',
  'Ural-63704-0010 Tornado-U',
  'Ural-542301 tank transporter',
  'KamAZ 4x4',
  'KamAZ 6x6',
  'KamAZ Avtozaks',
  'KamAZ with armoured cabin',
  'KamAZ-6350 8x8 artillery tractor',
  'UAZ-469',
  'UAZ Patriot',
  'UAZ-23632',
  'Armoured SUV',
  'Unknown truck',
  '(Unknown) vehicle'],
 ['BMP-1Khs',
  'BRM-1K',
  'BRDM-2',
  '9P148 Konkurs ATGM carrier',
  'MT-LB',
  'MT-LB with ZU-23 AA gun',
  'MT-LB Ambulance',
  'MT-LBu',
  'BSEM-4K',
  'BREM-1 ARV',
  'Vepr MRAP',
  "1V18 'Klyon-1' artillery command and forward observer vehicle",
  'SNAR-10 battlefield surveillance radar',
  'Unknown AFV'],
 ['KrAZ-6322',
  'KrAZ-5233',
  "transporter-loader (for BM-27 'Uragan' MRL)",
  'GAZ-66',
  'ZiL-131',
  'Ural-375D',
  'Ural-4320',
  'KamAZ',
  'MAZ-537',
  'MAZ',
  'UAZ-469',
  'UAZ-452',
  'Unknown 

In [39]:
## parsed record totals of the mismatching groups
n_test = [test[i] for i in idx_wrong]

In [40]:
## display the parsed totals
n_test

[422, 49, 87]

In [41]:
## heading numbers of the mismatching groups
n_num = [num[i] for i in idx_wrong]

In [42]:
## display the heading numbers
n_num

[425, 47, 85]

In [43]:
## net difference (parsed total minus heading number) over the mismatching groups
sum(i - j for i, j in zip(n_test, n_num))

1